mirror of
				https://github.com/simon987/Simple-Incremental-Search-Tool.git
				synced 2025-10-31 15:36:52 +00:00 
			
		
		
		
	Bug fixes + multi threading
This commit is contained in:
		
							parent
							
								
									36b9ed6cb7
								
							
						
					
					
						commit
						746ad25a4e
					
				| @ -29,6 +29,7 @@ Once the web server is running, you can connect to the search interface by typin | |||||||
| 
 | 
 | ||||||
| * Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) | * Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) | ||||||
| 
 | 
 | ||||||
|  | * Edit settings in [config.py](https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py) (Default values are ok in most cases) | ||||||
| ```bash | ```bash | ||||||
| git clone https://github.com/simon987/Simple-Incremental-Search-Tool | git clone https://github.com/simon987/Simple-Incremental-Search-Tool | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										11
									
								
								config.py
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								config.py
									
									
									
									
									
								
							| @ -26,7 +26,14 @@ bcrypt_rounds = 14 | |||||||
| db_path = "./local_storage.db" | db_path = "./local_storage.db" | ||||||
| 
 | 
 | ||||||
| # Set to true to allow guests to search any directory | # Set to true to allow guests to search any directory | ||||||
| allow_guests = False | allow_guests = True | ||||||
|  | 
 | ||||||
|  | # Number of threads used for parsing | ||||||
|  | parse_threads = 8 | ||||||
|  | 
 | ||||||
|  | # Number of threads used for thumbnail generation | ||||||
|  | tn_threads = 32 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|     import cairosvg |     import cairosvg | ||||||
| @ -34,4 +41,4 @@ try: | |||||||
| except: | except: | ||||||
|     cairosvg = False |     cairosvg = False | ||||||
| 
 | 
 | ||||||
| VERSION = "1.0a" | VERSION = "1.1a" | ||||||
|  | |||||||
							
								
								
									
										128
									
								
								crawler.py
									
									
									
									
									
								
							
							
						
						
									
										128
									
								
								crawler.py
									
									
									
									
									
								
							| @ -2,6 +2,8 @@ import json | |||||||
| import os | import os | ||||||
| import shutil | import shutil | ||||||
| from multiprocessing import Process, Value | from multiprocessing import Process, Value | ||||||
|  | from queue import Queue, Empty, Full | ||||||
|  | from threading import Thread | ||||||
| 
 | 
 | ||||||
| from apscheduler.schedulers.background import BackgroundScheduler | from apscheduler.schedulers.background import BackgroundScheduler | ||||||
| 
 | 
 | ||||||
| @ -51,39 +53,42 @@ class Crawler: | |||||||
| 
 | 
 | ||||||
|         self.mime_guesser = mime_guesser |         self.mime_guesser = mime_guesser | ||||||
| 
 | 
 | ||||||
|     def crawl(self, root_dir: str, counter: Value = None): |     def crawl(self, root_dir: str, counter: Value = None, total_files = None): | ||||||
| 
 | 
 | ||||||
|         document_counter = 0 |         in_q = Queue(50000)  # TODO: get from config? | ||||||
|  |         out_q = Queue() | ||||||
|  | 
 | ||||||
|  |         threads = [] | ||||||
|  |         print("Creating %d threads" % (config.parse_threads,)) | ||||||
|  |         for _ in range(config.parse_threads): | ||||||
|  |             t = Thread(target=self.parse_file, args=[in_q, out_q, ]) | ||||||
|  |             threads.append(t) | ||||||
|  |             t.start() | ||||||
|  | 
 | ||||||
|  |         indexer_thread = Thread(target=self.index_file, args=[out_q, counter, ]) | ||||||
|  |         indexer_thread.start() | ||||||
| 
 | 
 | ||||||
|         for root, dirs, files in os.walk(root_dir): |         for root, dirs, files in os.walk(root_dir): | ||||||
| 
 |  | ||||||
|             for filename in files: |             for filename in files: | ||||||
|                 full_path = os.path.join(root, filename) |                 while True: | ||||||
| 
 |  | ||||||
|                 mime = self.mime_guesser.guess_mime(full_path) |  | ||||||
| 
 |  | ||||||
|                 parser = self.ext_map.get(mime, self.default_parser) |  | ||||||
| 
 |  | ||||||
|                 document_counter += 1 |  | ||||||
|                 if document_counter >= config.index_every: |  | ||||||
|                     document_counter = 0 |  | ||||||
| 
 |  | ||||||
|                     self.indexer.index(self.documents, self.dir_id) |  | ||||||
|                     self.documents.clear() |  | ||||||
| 
 |  | ||||||
|                     try: |                     try: | ||||||
|                     if counter: |                         in_q.put(os.path.join(root, filename), timeout=10) | ||||||
|                         counter.value += 1 |                         if total_files: | ||||||
|  |                             total_files.value += 1 | ||||||
|  |                         break | ||||||
|  |                     except Full: | ||||||
|  |                         continue | ||||||
| 
 | 
 | ||||||
|                     doc = parser.parse(full_path) |         in_q.join() | ||||||
|                     doc["mime"] = mime |         out_q.join() | ||||||
| 
 | 
 | ||||||
|                     self.documents.append(doc) |         for _ in threads: | ||||||
|                 except FileNotFoundError: |             in_q.put(None) | ||||||
|                     continue  # File was deleted |         out_q.put(None) | ||||||
| 
 | 
 | ||||||
|         if self.indexer is not None and len(self.documents) > 0: |         indexer_thread.join() | ||||||
|             self.indexer.index(self.documents, self.dir_id) |         for t in threads: | ||||||
|  |             t.join() | ||||||
| 
 | 
 | ||||||
|     def countFiles(self, root_dir: str): |     def countFiles(self, root_dir: str): | ||||||
|         count = 0 |         count = 0 | ||||||
| @ -93,6 +98,61 @@ class Crawler: | |||||||
| 
 | 
 | ||||||
|         return count |         return count | ||||||
| 
 | 
 | ||||||
|  |     def parse_file(self, in_q: Queue, out_q: Queue): | ||||||
|  | 
 | ||||||
|  |         while True: | ||||||
|  |             try: | ||||||
|  |                 full_path = in_q.get(timeout=1) | ||||||
|  |                 if full_path is None: | ||||||
|  |                     break | ||||||
|  |             except Empty: | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |             try: | ||||||
|  |                 mime = self.mime_guesser.guess_mime(full_path) | ||||||
|  |                 parser = self.ext_map.get(mime, self.default_parser) | ||||||
|  | 
 | ||||||
|  |                 doc = parser.parse(full_path) | ||||||
|  |                 doc["mime"] = mime | ||||||
|  |                 out_q.put(doc) | ||||||
|  |             finally: | ||||||
|  |                 in_q.task_done() | ||||||
|  | 
 | ||||||
|  |     def index_file(self, out_q: Queue, count: Value): | ||||||
|  | 
 | ||||||
|  |         if self.indexer is None: | ||||||
|  |             while True: | ||||||
|  |                 try: | ||||||
|  |                     doc = out_q.get(timeout=10) | ||||||
|  |                     if doc is None: | ||||||
|  |                         break | ||||||
|  |                 except Empty: | ||||||
|  |                     break | ||||||
|  |                 self.documents.append(doc) | ||||||
|  |                 out_q.task_done() | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         while True: | ||||||
|  |             try: | ||||||
|  |                 doc = out_q.get(timeout=10) | ||||||
|  |                 if doc is None: | ||||||
|  |                     break | ||||||
|  |             except Empty: | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |             try: | ||||||
|  |                 self.documents.append(doc) | ||||||
|  |                 count.value += 1 | ||||||
|  | 
 | ||||||
|  |                 if count.value % config.index_every == 0: | ||||||
|  |                     self.indexer.index(self.documents, self.dir_id) | ||||||
|  |                     self.documents.clear() | ||||||
|  |             except: | ||||||
|  |                 pass | ||||||
|  |             finally: | ||||||
|  |                 out_q.task_done() | ||||||
|  |         self.indexer.index(self.documents, self.dir_id) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class TaskManager: | class TaskManager: | ||||||
|     def __init__(self, storage: LocalStorage): |     def __init__(self, storage: LocalStorage): | ||||||
| @ -112,10 +172,10 @@ class TaskManager: | |||||||
| 
 | 
 | ||||||
|         if task.type == Task.INDEX: |         if task.type == Task.INDEX: | ||||||
|             c = Crawler([]) |             c = Crawler([]) | ||||||
|             self.current_task.total_files.value = c.countFiles(directory.path) |             self.current_process = Process(target=self.execute_crawl, args=(directory, | ||||||
| 
 |                                                                             self.current_task.parsed_files, | ||||||
|             self.current_process = Process(target=self.execute_crawl, args=(directory, self.current_task.parsed_files, |                                                                             self.current_task.done, | ||||||
|                                                                             self.current_task.done)) |                                                                             self.current_task.total_files)) | ||||||
| 
 | 
 | ||||||
|         elif task.type == Task.GEN_THUMBNAIL: |         elif task.type == Task.GEN_THUMBNAIL: | ||||||
|             self.current_process = Process(target=self.execute_thumbnails, args=(directory, |             self.current_process = Process(target=self.execute_thumbnails, args=(directory, | ||||||
| @ -124,7 +184,7 @@ class TaskManager: | |||||||
|                                                                                  self.current_task.done)) |                                                                                  self.current_task.done)) | ||||||
|         self.current_process.start() |         self.current_process.start() | ||||||
| 
 | 
 | ||||||
|     def execute_crawl(self, directory: Directory, counter: Value, done: Value): |     def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value): | ||||||
| 
 | 
 | ||||||
|         Search("changeme").delete_directory(directory.id) |         Search("changeme").delete_directory(directory.id) | ||||||
| 
 | 
 | ||||||
| @ -151,7 +211,7 @@ class TaskManager: | |||||||
|                      DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path), |                      DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path), | ||||||
|                      EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)], |                      EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)], | ||||||
|                     mime_guesser, self.indexer, directory.id) |                     mime_guesser, self.indexer, directory.id) | ||||||
|         c.crawl(directory.path, counter) |         c.crawl(directory.path, counter, total_files) | ||||||
| 
 | 
 | ||||||
|         done.value = 1 |         done.value = 1 | ||||||
| 
 | 
 | ||||||
| @ -161,14 +221,12 @@ class TaskManager: | |||||||
|         if os.path.exists(dest_path): |         if os.path.exists(dest_path): | ||||||
|             shutil.rmtree(dest_path) |             shutil.rmtree(dest_path) | ||||||
| 
 | 
 | ||||||
|         docs = list(Search("changeme").get_all_documents(directory.id)) |         docs = Search("changeme").get_all_documents(directory.id) | ||||||
| 
 |  | ||||||
|         total_files.value = len(docs) |  | ||||||
| 
 | 
 | ||||||
|         tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")), |         tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")), | ||||||
|                                           int(directory.get_option("ThumbnailQuality")), |                                           int(directory.get_option("ThumbnailQuality")), | ||||||
|                                           directory.get_option("ThumbnailColor")) |                                           directory.get_option("ThumbnailColor")) | ||||||
|         tn_generator.generate_all(docs, dest_path, counter, directory) |         tn_generator.generate_all(docs, dest_path, counter, directory, total_files) | ||||||
| 
 | 
 | ||||||
|         done.value = 1 |         done.value = 1 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -143,7 +143,7 @@ class GenericFileParser(FileParser): | |||||||
|         name, extension = os.path.splitext(name) |         name, extension = os.path.splitext(name) | ||||||
| 
 | 
 | ||||||
|         info["size"] = file_stat.st_size |         info["size"] = file_stat.st_size | ||||||
|         info["path"] = path[self.root_dir_len:] |         info["path"] = os.path.relpath(path, self.root_dir) | ||||||
|         info["name"] = name |         info["name"] = name | ||||||
|         info["extension"] = extension[1:] |         info["extension"] = extension[1:] | ||||||
|         info["mtime"] = file_stat.st_mtime |         info["mtime"] = file_stat.st_mtime | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								run.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								run.py
									
									
									
									
									
								
							| @ -241,7 +241,7 @@ def search_liste_page(): | |||||||
| 
 | 
 | ||||||
| def get_allowed_dirs(username): | def get_allowed_dirs(username): | ||||||
|     if config.allow_guests: |     if config.allow_guests: | ||||||
|         return [x for x in storage.dirs() if x.enabled] |         return [x for x in storage.dirs() if storage.dirs()[x].enabled] | ||||||
|     if username: |     if username: | ||||||
|         user = storage.users()[username] |         user = storage.users()[username] | ||||||
|         return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories] |         return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories] | ||||||
|  | |||||||
| @ -149,7 +149,7 @@ class Search: | |||||||
|             "aggs": { |             "aggs": { | ||||||
|                 "total_size": {"sum": {"field": "size"}} |                 "total_size": {"sum": {"field": "size"}} | ||||||
|             }, |             }, | ||||||
|             "size": 40}, index=self.index_name, scroll="3m") |             "size": 40}, index=self.index_name, scroll="30m") | ||||||
| 
 | 
 | ||||||
|         return page |         return page | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -267,7 +267,7 @@ function createDocCard(hit) { | |||||||
|                 } |                 } | ||||||
|                 thumbnailOverlay.appendChild(resolutionBadge); |                 thumbnailOverlay.appendChild(resolutionBadge); | ||||||
| 
 | 
 | ||||||
|                 var format = hit["_source"]["format"]; |                 var format = hit["_source"]["format_name"]; | ||||||
| 
 | 
 | ||||||
|                 //Hover
 |                 //Hover
 | ||||||
|                 if(format === "GIF") { |                 if(format === "GIF") { | ||||||
| @ -429,6 +429,8 @@ window.addEventListener("scroll", function () { | |||||||
|                     if (hits.length !== 0) { |                     if (hits.length !== 0) { | ||||||
|                         coolingDown = false; |                         coolingDown = false; | ||||||
|                     } |                     } | ||||||
|  |                 } else if (this.status === 500) { | ||||||
|  |                     window.location.reload() | ||||||
|                 } |                 } | ||||||
|             }; |             }; | ||||||
|             xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true); |             xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true); | ||||||
|  | |||||||
| @ -143,7 +143,7 @@ | |||||||
|         </div> |         </div> | ||||||
| 
 | 
 | ||||||
|         <div class="card"> |         <div class="card"> | ||||||
|             <div class="card-header">Options <a href="#" style="float:right">Learn more <i |             <div class="card-header">Options <a href="https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py#L1-L13" style="float:right">Learn more <i | ||||||
|                     class="fas fa-external-link-alt"></i></a></div> |                     class="fas fa-external-link-alt"></i></a></div> | ||||||
|             <div class="card-body"> |             <div class="card-body"> | ||||||
|                 <table class="info-table table-striped table-hover"> |                 <table class="info-table table-striped table-hover"> | ||||||
|  | |||||||
							
								
								
									
										53
									
								
								thumbnail.py
									
									
									
									
									
								
							
							
						
						
									
										53
									
								
								thumbnail.py
									
									
									
									
									
								
							| @ -1,6 +1,10 @@ | |||||||
|  | from queue import Full, Empty | ||||||
|  | from threading import Thread | ||||||
|  | 
 | ||||||
| from PIL import Image | from PIL import Image | ||||||
| import os | import os | ||||||
| from multiprocessing import Value, Process | from multiprocessing import Value, Process | ||||||
|  | from queue import Queue | ||||||
| import ffmpeg | import ffmpeg | ||||||
| import config | import config | ||||||
| 
 | 
 | ||||||
| @ -22,10 +26,11 @@ class ThumbnailGenerator: | |||||||
| 
 | 
 | ||||||
|         if mime == "image/svg+xml" and config.cairosvg: |         if mime == "image/svg+xml" and config.cairosvg: | ||||||
| 
 | 
 | ||||||
|  |             tmpfile = dest_path + "_tmp" | ||||||
|             try: |             try: | ||||||
|                 p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"}) |                 p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": tmpfile}) | ||||||
|                 p.start() |                 p.start() | ||||||
|                 p.join(1) |                 p.join(5) | ||||||
| 
 | 
 | ||||||
|                 if p.is_alive(): |                 if p.is_alive(): | ||||||
|                     p.terminate() |                     p.terminate() | ||||||
| @ -35,8 +40,8 @@ class ThumbnailGenerator: | |||||||
|             except Exception: |             except Exception: | ||||||
|                 print("Couldn't make thumbnail for " + path) |                 print("Couldn't make thumbnail for " + path) | ||||||
| 
 | 
 | ||||||
|             if os.path.exists("tmp"): |             if os.path.exists(tmpfile): | ||||||
|                 os.remove("tmp") |                 os.remove(tmpfile) | ||||||
| 
 | 
 | ||||||
|         elif mime.startswith("image"): |         elif mime.startswith("image"): | ||||||
| 
 | 
 | ||||||
| @ -59,11 +64,16 @@ class ThumbnailGenerator: | |||||||
|             if os.path.exists("tmp"): |             if os.path.exists("tmp"): | ||||||
|                 os.remove("tmp") |                 os.remove("tmp") | ||||||
| 
 | 
 | ||||||
|     def generate_all(self, docs, dest_path,  counter: Value=None, directory=None): |     def worker(self, in_q: Queue, counter: Value, dest_path, directory): | ||||||
| 
 | 
 | ||||||
|         os.makedirs(dest_path, exist_ok=True) |         while True: | ||||||
|  |             try: | ||||||
|  |                 doc = in_q.get(timeout=1) | ||||||
|  |                 if doc is None: | ||||||
|  |                     break | ||||||
|  |             except Empty: | ||||||
|  |                 break | ||||||
| 
 | 
 | ||||||
|         for doc in docs: |  | ||||||
|             extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"] |             extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"] | ||||||
|             full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension) |             full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension) | ||||||
| 
 | 
 | ||||||
| @ -73,6 +83,35 @@ class ThumbnailGenerator: | |||||||
|             if counter is not None: |             if counter is not None: | ||||||
|                 counter.value += 1 |                 counter.value += 1 | ||||||
| 
 | 
 | ||||||
|  |             in_q.task_done() | ||||||
|  | 
 | ||||||
|  |     def generate_all(self, docs, dest_path, counter: Value = None, directory=None, total_count=None): | ||||||
|  | 
 | ||||||
|  |         os.makedirs(dest_path, exist_ok=True) | ||||||
|  | 
 | ||||||
|  |         in_q = Queue(50000)  # TODO: load from config? | ||||||
|  |         threads = [] | ||||||
|  |         for _ in range(config.tn_threads): | ||||||
|  |             t = Thread(target=self.worker, args=[in_q, counter, dest_path, directory]) | ||||||
|  |             threads.append(t) | ||||||
|  |             t.start() | ||||||
|  | 
 | ||||||
|  |         for doc in docs: | ||||||
|  |             while True: | ||||||
|  |                 try: | ||||||
|  |                     in_q.put(doc, timeout=10) | ||||||
|  |                     if total_count: | ||||||
|  |                         total_count.value += 1 | ||||||
|  |                     break | ||||||
|  |                 except Full: | ||||||
|  |                     continue | ||||||
|  | 
 | ||||||
|  |         in_q.join() | ||||||
|  |         for _ in threads: | ||||||
|  |             in_q.put(None) | ||||||
|  |         for t in threads: | ||||||
|  |             t.join() | ||||||
|  | 
 | ||||||
|     def generate_image(self, path, dest_path): |     def generate_image(self, path, dest_path): | ||||||
| 
 | 
 | ||||||
|         with open(path, "rb") as image_file: |         with open(path, "rb") as image_file: | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user