Bug fixes + multi-threading

This commit is contained in:
simon987 2019-03-10 20:15:53 -04:00
parent 36b9ed6cb7
commit 746ad25a4e
9 changed files with 157 additions and 50 deletions

View File

@ -29,6 +29,7 @@ Once the web server is running, you can connect to the search interface by typin
* Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) * Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch)
* Edit settings in [config.py](https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py) (Default values are ok in most cases)
```bash ```bash
git clone https://github.com/simon987/Simple-Incremental-Search-Tool git clone https://github.com/simon987/Simple-Incremental-Search-Tool

View File

@ -26,7 +26,14 @@ bcrypt_rounds = 14
db_path = "./local_storage.db" db_path = "./local_storage.db"
# Set to true to allow guests to search any directory # Set to true to allow guests to search any directory
allow_guests = False allow_guests = True
# Number of threads used for parsing
parse_threads = 8
# Number of threads used for thumbnail generation
tn_threads = 32
try: try:
import cairosvg import cairosvg
@ -34,4 +41,4 @@ try:
except: except:
cairosvg = False cairosvg = False
VERSION = "1.0a" VERSION = "1.1a"

View File

@ -2,6 +2,8 @@ import json
import os import os
import shutil import shutil
from multiprocessing import Process, Value from multiprocessing import Process, Value
from queue import Queue, Empty, Full
from threading import Thread
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
@ -51,39 +53,42 @@ class Crawler:
self.mime_guesser = mime_guesser self.mime_guesser = mime_guesser
def crawl(self, root_dir: str, counter: Value = None): def crawl(self, root_dir: str, counter: Value = None, total_files = None):
document_counter = 0 in_q = Queue(50000) # TODO: get from config?
out_q = Queue()
threads = []
print("Creating %d threads" % (config.parse_threads,))
for _ in range(config.parse_threads):
t = Thread(target=self.parse_file, args=[in_q, out_q, ])
threads.append(t)
t.start()
indexer_thread = Thread(target=self.index_file, args=[out_q, counter, ])
indexer_thread.start()
for root, dirs, files in os.walk(root_dir): for root, dirs, files in os.walk(root_dir):
for filename in files: for filename in files:
full_path = os.path.join(root, filename) while True:
mime = self.mime_guesser.guess_mime(full_path)
parser = self.ext_map.get(mime, self.default_parser)
document_counter += 1
if document_counter >= config.index_every:
document_counter = 0
self.indexer.index(self.documents, self.dir_id)
self.documents.clear()
try: try:
if counter: in_q.put(os.path.join(root, filename), timeout=10)
counter.value += 1 if total_files:
total_files.value += 1
break
except Full:
continue
doc = parser.parse(full_path) in_q.join()
doc["mime"] = mime out_q.join()
self.documents.append(doc) for _ in threads:
except FileNotFoundError: in_q.put(None)
continue # File was deleted out_q.put(None)
if self.indexer is not None and len(self.documents) > 0: indexer_thread.join()
self.indexer.index(self.documents, self.dir_id) for t in threads:
t.join()
def countFiles(self, root_dir: str): def countFiles(self, root_dir: str):
count = 0 count = 0
@ -93,6 +98,61 @@ class Crawler:
return count return count
def parse_file(self, in_q: Queue, out_q: Queue):
    """Worker-thread loop: pull file paths from in_q, parse them, push docs to out_q.

    Exits when it receives a ``None`` sentinel or after 1s with no work.
    ``task_done()`` is always called for every dequeued path so that
    ``in_q.join()`` in the producer can complete.

    :param in_q: queue of absolute file paths to parse (``None`` = stop)
    :param out_q: queue receiving parsed document dicts (gains a "mime" key)
    """
    while True:
        try:
            full_path = in_q.get(timeout=1)
            if full_path is None:
                break
        except Empty:
            break

        try:
            mime = self.mime_guesser.guess_mime(full_path)
            parser = self.ext_map.get(mime, self.default_parser)

            doc = parser.parse(full_path)
            doc["mime"] = mime

            out_q.put(doc)
        except Exception as e:
            # A single unreadable/deleted/corrupt file must not kill this
            # worker thread: if the exception escaped the loop the thread
            # would die, queued paths would never get task_done(), and
            # in_q.join() in crawl() could block forever.
            print("Couldn't parse %s: %s" % (full_path, e))
        finally:
            in_q.task_done()
def index_file(self, out_q: Queue, count: Value):
    """Indexer-thread loop: drain parsed documents from out_q.

    With no indexer configured, documents are only accumulated into
    ``self.documents``. Otherwise they are flushed to the indexer in
    batches of ``config.index_every`` documents, with a final flush when
    the loop ends. Exits on a ``None`` sentinel or after 10s of inactivity.

    :param out_q: queue of parsed document dicts (``None`` = stop)
    :param count: shared counter of indexed documents (updated only here)
    """
    if self.indexer is None:
        # No indexer: just collect the documents in memory.
        while True:
            try:
                doc = out_q.get(timeout=10)
                if doc is None:
                    break
            except Empty:
                break
            self.documents.append(doc)
            out_q.task_done()
        return

    while True:
        try:
            doc = out_q.get(timeout=10)
            if doc is None:
                break
        except Empty:
            break

        try:
            self.documents.append(doc)
            count.value += 1
            if count.value % config.index_every == 0:
                self.indexer.index(self.documents, self.dir_id)
                self.documents.clear()
        except Exception as e:
            # Keep draining the queue even if one index batch fails; the
            # previous bare `except: pass` also silently swallowed
            # SystemExit/KeyboardInterrupt, which can hang shutdown.
            print("Indexing error: %s" % (e,))
        finally:
            out_q.task_done()

    # Flush whatever remains after the sentinel/timeout.
    self.indexer.index(self.documents, self.dir_id)
class TaskManager: class TaskManager:
def __init__(self, storage: LocalStorage): def __init__(self, storage: LocalStorage):
@ -112,10 +172,10 @@ class TaskManager:
if task.type == Task.INDEX: if task.type == Task.INDEX:
c = Crawler([]) c = Crawler([])
self.current_task.total_files.value = c.countFiles(directory.path) self.current_process = Process(target=self.execute_crawl, args=(directory,
self.current_task.parsed_files,
self.current_process = Process(target=self.execute_crawl, args=(directory, self.current_task.parsed_files, self.current_task.done,
self.current_task.done)) self.current_task.total_files))
elif task.type == Task.GEN_THUMBNAIL: elif task.type == Task.GEN_THUMBNAIL:
self.current_process = Process(target=self.execute_thumbnails, args=(directory, self.current_process = Process(target=self.execute_thumbnails, args=(directory,
@ -124,7 +184,7 @@ class TaskManager:
self.current_task.done)) self.current_task.done))
self.current_process.start() self.current_process.start()
def execute_crawl(self, directory: Directory, counter: Value, done: Value): def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value):
Search("changeme").delete_directory(directory.id) Search("changeme").delete_directory(directory.id)
@ -151,7 +211,7 @@ class TaskManager:
DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path), DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)], EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
mime_guesser, self.indexer, directory.id) mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter) c.crawl(directory.path, counter, total_files)
done.value = 1 done.value = 1
@ -161,14 +221,12 @@ class TaskManager:
if os.path.exists(dest_path): if os.path.exists(dest_path):
shutil.rmtree(dest_path) shutil.rmtree(dest_path)
docs = list(Search("changeme").get_all_documents(directory.id)) docs = Search("changeme").get_all_documents(directory.id)
total_files.value = len(docs)
tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")), tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
int(directory.get_option("ThumbnailQuality")), int(directory.get_option("ThumbnailQuality")),
directory.get_option("ThumbnailColor")) directory.get_option("ThumbnailColor"))
tn_generator.generate_all(docs, dest_path, counter, directory) tn_generator.generate_all(docs, dest_path, counter, directory, total_files)
done.value = 1 done.value = 1

View File

@ -143,7 +143,7 @@ class GenericFileParser(FileParser):
name, extension = os.path.splitext(name) name, extension = os.path.splitext(name)
info["size"] = file_stat.st_size info["size"] = file_stat.st_size
info["path"] = path[self.root_dir_len:] info["path"] = os.path.relpath(path, self.root_dir)
info["name"] = name info["name"] = name
info["extension"] = extension[1:] info["extension"] = extension[1:]
info["mtime"] = file_stat.st_mtime info["mtime"] = file_stat.st_mtime

2
run.py
View File

@ -241,7 +241,7 @@ def search_liste_page():
def get_allowed_dirs(username): def get_allowed_dirs(username):
if config.allow_guests: if config.allow_guests:
return [x for x in storage.dirs() if x.enabled] return [x for x in storage.dirs() if storage.dirs()[x].enabled]
if username: if username:
user = storage.users()[username] user = storage.users()[username]
return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories] return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories]

View File

@ -149,7 +149,7 @@ class Search:
"aggs": { "aggs": {
"total_size": {"sum": {"field": "size"}} "total_size": {"sum": {"field": "size"}}
}, },
"size": 40}, index=self.index_name, scroll="3m") "size": 40}, index=self.index_name, scroll="30m")
return page return page

View File

@ -267,7 +267,7 @@ function createDocCard(hit) {
} }
thumbnailOverlay.appendChild(resolutionBadge); thumbnailOverlay.appendChild(resolutionBadge);
var format = hit["_source"]["format"]; var format = hit["_source"]["format_name"];
//Hover //Hover
if(format === "GIF") { if(format === "GIF") {
@ -429,6 +429,8 @@ window.addEventListener("scroll", function () {
if (hits.length !== 0) { if (hits.length !== 0) {
coolingDown = false; coolingDown = false;
} }
} else if (this.status === 500) {
window.location.reload()
} }
}; };
xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true); xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true);

View File

@ -143,7 +143,7 @@
</div> </div>
<div class="card"> <div class="card">
<div class="card-header">Options <a href="#" style="float:right">Learn more <i <div class="card-header">Options <a href="https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py#L1-L13" style="float:right">Learn more <i
class="fas fa-external-link-alt"></i></a></div> class="fas fa-external-link-alt"></i></a></div>
<div class="card-body"> <div class="card-body">
<table class="info-table table-striped table-hover"> <table class="info-table table-striped table-hover">

View File

@ -1,6 +1,10 @@
from queue import Full, Empty
from threading import Thread
from PIL import Image from PIL import Image
import os import os
from multiprocessing import Value, Process from multiprocessing import Value, Process
from queue import Queue
import ffmpeg import ffmpeg
import config import config
@ -22,10 +26,11 @@ class ThumbnailGenerator:
if mime == "image/svg+xml" and config.cairosvg: if mime == "image/svg+xml" and config.cairosvg:
tmpfile = dest_path + "_tmp"
try: try:
p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"}) p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": tmpfile})
p.start() p.start()
p.join(1) p.join(5)
if p.is_alive(): if p.is_alive():
p.terminate() p.terminate()
@ -35,8 +40,8 @@ class ThumbnailGenerator:
except Exception: except Exception:
print("Couldn't make thumbnail for " + path) print("Couldn't make thumbnail for " + path)
if os.path.exists("tmp"): if os.path.exists(tmpfile):
os.remove("tmp") os.remove(tmpfile)
elif mime.startswith("image"): elif mime.startswith("image"):
@ -59,11 +64,16 @@ class ThumbnailGenerator:
if os.path.exists("tmp"): if os.path.exists("tmp"):
os.remove("tmp") os.remove("tmp")
def generate_all(self, docs, dest_path, counter: Value=None, directory=None): def worker(self, in_q: Queue, counter: Value, dest_path, directory):
os.makedirs(dest_path, exist_ok=True) while True:
try:
doc = in_q.get(timeout=1)
if doc is None:
break
except Empty:
break
for doc in docs:
extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"] extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"]
full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension) full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension)
@ -73,6 +83,35 @@ class ThumbnailGenerator:
if counter is not None: if counter is not None:
counter.value += 1 counter.value += 1
in_q.task_done()
def generate_all(self, docs, dest_path, counter: Value = None, directory=None, total_count=None):
    """Generate thumbnails for every document using a pool of worker threads.

    :param docs: iterable of Elasticsearch document hits to thumbnail
    :param dest_path: output directory (created if missing)
    :param counter: shared progress counter, incremented by the workers
    :param directory: Directory the documents belong to
    :param total_count: shared counter incremented once per enqueued doc
    """
    os.makedirs(dest_path, exist_ok=True)

    work_queue = Queue(50000)  # TODO: load from config?

    workers = []
    for _ in range(config.tn_threads):
        w = Thread(target=self.worker, args=[work_queue, counter, dest_path, directory])
        workers.append(w)
        w.start()

    for doc in docs:
        # Block until the bounded queue has room, retrying on Full.
        while True:
            try:
                work_queue.put(doc, timeout=10)
                if total_count:
                    total_count.value += 1
                break
            except Full:
                continue

    work_queue.join()

    # One sentinel per worker so each thread wakes up and exits.
    for _ in workers:
        work_queue.put(None)
    for w in workers:
        w.join()
def generate_image(self, path, dest_path): def generate_image(self, path, dest_path):
with open(path, "rb") as image_file: with open(path, "rb") as image_file: