diff --git a/README.md b/README.md
index c0360b1..824357e 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ Once the web server is running, you can connect to the search interface by typin
 
 * Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch)
+* Edit settings in [config.py](https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py) (Default values are ok in most cases)
 
 ```bash
 git clone https://github.com/simon987/Simple-Incremental-Search-Tool
diff --git a/config.py b/config.py
index 0eef69b..ac844e6 100644
--- a/config.py
+++ b/config.py
@@ -26,7 +26,14 @@ bcrypt_rounds = 14
 db_path = "./local_storage.db"
 
 # Set to true to allow guests to search any directory
-allow_guests = False
+allow_guests = True
+
+# Number of threads used for parsing
+parse_threads = 8
+
+# Number of threads used for thumbnail generation
+tn_threads = 32
+
 try:
     import cairosvg
@@ -34,4 +41,4 @@ try:
 except:
     cairosvg = False
 
-VERSION = "1.0a"
+VERSION = "1.1a"
diff --git a/crawler.py b/crawler.py
index f4aaae8..c14bc4d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -2,6 +2,8 @@ import json
 import os
 import shutil
 from multiprocessing import Process, Value
+from queue import Queue, Empty, Full
+from threading import Thread
 
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -51,39 +53,42 @@ class Crawler:
 
         self.mime_guesser = mime_guesser
 
-    def crawl(self, root_dir: str, counter: Value = None):
+    def crawl(self, root_dir: str, counter: Value = None, total_files = None):
 
-        document_counter = 0
+        in_q = Queue(50000)  # TODO: get from config?
+        out_q = Queue()
+
+        threads = []
+        print("Creating %d threads" % (config.parse_threads,))
+        for _ in range(config.parse_threads):
+            t = Thread(target=self.parse_file, args=[in_q, out_q, ])
+            threads.append(t)
+            t.start()
+
+        indexer_thread = Thread(target=self.index_file, args=[out_q, counter, ])
+        indexer_thread.start()
 
         for root, dirs, files in os.walk(root_dir):
 
             for filename in files:
-                full_path = os.path.join(root, filename)
+                while True:
+                    try:
+                        in_q.put(os.path.join(root, filename), timeout=10)
+                        if total_files:
+                            total_files.value += 1
+                        break
+                    except Full:
+                        continue
 
-                mime = self.mime_guesser.guess_mime(full_path)
+        in_q.join()
+        out_q.join()
 
-                parser = self.ext_map.get(mime, self.default_parser)
+        for _ in threads:
+            in_q.put(None)
+        out_q.put(None)
 
-                document_counter += 1
-                if document_counter >= config.index_every:
-                    document_counter = 0
-
-                    self.indexer.index(self.documents, self.dir_id)
-                    self.documents.clear()
-
-                try:
-                    if counter:
-                        counter.value += 1
-
-                    doc = parser.parse(full_path)
-                    doc["mime"] = mime
-
-                    self.documents.append(doc)
-                except FileNotFoundError:
-                    continue  # File was deleted
-
-        if self.indexer is not None and len(self.documents) > 0:
-            self.indexer.index(self.documents, self.dir_id)
+        indexer_thread.join()
+        for t in threads:
+            t.join()
 
     def countFiles(self, root_dir: str):
         count = 0
@@ -93,6 +98,61 @@
 
         return count
 
+    def parse_file(self, in_q: Queue, out_q: Queue):
+
+        while True:
+            try:
+                full_path = in_q.get(timeout=1)
+                if full_path is None:
+                    break
+            except Empty:
+                break
+
+            try:
+                mime = self.mime_guesser.guess_mime(full_path)
+                parser = self.ext_map.get(mime, self.default_parser)
+
+                doc = parser.parse(full_path)
+                doc["mime"] = mime
+                out_q.put(doc)
+            finally:
+                in_q.task_done()
+
+    def index_file(self, out_q: Queue, count: Value):
+
+        if self.indexer is None:
+            while True:
+                try:
+                    doc = out_q.get(timeout=10)
+                    if doc is None:
+                        break
+                except Empty:
+                    break
+                self.documents.append(doc)
+                out_q.task_done()
+            return
+
+        while True:
+            try:
+                doc = out_q.get(timeout=10)
+                if doc is None:
+                    break
+            except Empty:
+                break
+
+            try:
+                self.documents.append(doc)
+                count.value += 1
+
+                if count.value % config.index_every == 0:
+                    self.indexer.index(self.documents, self.dir_id)
+                    self.documents.clear()
+            except:
+                pass
+            finally:
+                out_q.task_done()
+        self.indexer.index(self.documents, self.dir_id)
+
 
 class TaskManager:
 
     def __init__(self, storage: LocalStorage):
@@ -112,10 +172,10 @@ class TaskManager:
 
             if task.type == Task.INDEX:
                 c = Crawler([])
-                self.current_task.total_files.value = c.countFiles(directory.path)
-
-                self.current_process = Process(target=self.execute_crawl, args=(directory, self.current_task.parsed_files,
-                                                                                self.current_task.done))
+                self.current_process = Process(target=self.execute_crawl, args=(directory,
+                                                                                self.current_task.parsed_files,
+                                                                                self.current_task.done,
+                                                                                self.current_task.total_files))
 
             elif task.type == Task.GEN_THUMBNAIL:
                 self.current_process = Process(target=self.execute_thumbnails, args=(directory,
@@ -124,7 +184,7 @@
                                                                                       self.current_task.done))
         self.current_process.start()
 
-    def execute_crawl(self, directory: Directory, counter: Value, done: Value):
+    def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value):
 
         Search("changeme").delete_directory(directory.id)
@@ -151,7 +211,7 @@
                     DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
                     EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
                    mime_guesser, self.indexer, directory.id)
 
-        c.crawl(directory.path, counter)
+        c.crawl(directory.path, counter, total_files)
 
         done.value = 1
@@ -161,14 +221,12 @@
 
         if os.path.exists(dest_path):
             shutil.rmtree(dest_path)
 
-        docs = list(Search("changeme").get_all_documents(directory.id))
-
-        total_files.value = len(docs)
+        docs = Search("changeme").get_all_documents(directory.id)
 
         tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
                                           int(directory.get_option("ThumbnailQuality")),
                                           directory.get_option("ThumbnailColor"))
-        tn_generator.generate_all(docs, dest_path, counter, directory)
+        tn_generator.generate_all(docs, dest_path, counter, directory, total_files)
 
         done.value = 1
diff --git a/parsing.py b/parsing.py
index 26a17bd..353426e 100644
--- a/parsing.py
+++ b/parsing.py
@@ -143,7 +143,7 @@ class GenericFileParser(FileParser):
         name, extension = os.path.splitext(name)
 
         info["size"] = file_stat.st_size
-        info["path"] = path[self.root_dir_len:]
+        info["path"] = os.path.relpath(path, self.root_dir)
         info["name"] = name
         info["extension"] = extension[1:]
         info["mtime"] = file_stat.st_mtime
diff --git a/run.py b/run.py
index 741be76..0502779 100644
--- a/run.py
+++ b/run.py
@@ -241,7 +241,7 @@ def search_liste_page():
 def get_allowed_dirs(username):
 
     if config.allow_guests:
-        return [x for x in storage.dirs() if x.enabled]
+        return [x for x in storage.dirs() if storage.dirs()[x].enabled]
     if username:
         user = storage.users()[username]
         return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories]
diff --git a/search.py b/search.py
index 62bafee..81e21ec 100644
--- a/search.py
+++ b/search.py
@@ -149,7 +149,7 @@ class Search:
             "aggs": {
                 "total_size": {"sum": {"field": "size"}}
             },
-            "size": 40}, index=self.index_name, scroll="3m")
+            "size": 40}, index=self.index_name, scroll="30m")
 
         return page
diff --git a/static/js/search.js b/static/js/search.js
index d43e9c6..2bdfb16 100644
--- a/static/js/search.js
+++ b/static/js/search.js
@@ -267,7 +267,7 @@ function createDocCard(hit) {
         }
         thumbnailOverlay.appendChild(resolutionBadge);
 
-        var format = hit["_source"]["format"];
+        var format = hit["_source"]["format_name"];
 
         //Hover
         if(format === "GIF") {
@@ -429,6 +429,8 @@ window.addEventListener("scroll", function () {
             if (hits.length !== 0) {
                 coolingDown = false;
             }
+        } else if (this.status === 500) {
+            window.location.reload()
         }
     };
     xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true);
diff --git a/templates/directory_manage.html b/templates/directory_manage.html
index 3d13e01..0703f95 100644
--- a/templates/directory_manage.html
+++ b/templates/directory_manage.html
@@ -143,7 +143,7 @@