Bug fixes

2025-04-24 12:35:51 +00:00 · 2018-04-21 20:36:49 -04:00 · 2018-04-21 20:36:49 -04:00 · b963b667b8
commit b963b667b8
parent 6b754b4bb4
18 changed files with 128 additions and 83 deletions
--- a/crawler.py
+++ b/crawler.py
@ -29,11 +29,13 @@ class RunningTask:

 class Crawler:

-    def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ContentMimeGuesser(), indexer=None, dir_id=0):
+    def __init__(self, enabled_parsers: list, mime_guesser: MimeGuesser=ContentMimeGuesser(), indexer=None, dir_id=0,
+                 root_dir="/"):
        self.documents = []
        self.enabled_parsers = enabled_parsers
        self.indexer = indexer
        self.dir_id = dir_id
+        self.root_dir = root_dir

        for parser in self.enabled_parsers:
            if parser.is_default:
@ -136,14 +138,14 @@ class TaskManager:
        mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
            else ContentMimeGuesser()

-        c = Crawler([GenericFileParser(chksum_calcs),
-                     MediaFileParser(chksum_calcs),
-                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
-                     PictureFileParser(chksum_calcs),
-                     FontParser(chksum_calcs),
-                     PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
-                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
-                     EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
+        c = Crawler([GenericFileParser(chksum_calcs, directory.path),
+                     MediaFileParser(chksum_calcs, directory.path),
+                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),
+                     PictureFileParser(chksum_calcs, directory.path),
+                     FontParser(chksum_calcs, directory.path),
+                     PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),  # todo get content len from other opt
+                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),  # todo get content len from other opt
+                     EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path)],  # todo get content len from other opt
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)

@ -162,7 +164,7 @@ class TaskManager:
        tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
                                          int(directory.get_option("ThumbnailQuality")),
                                          directory.get_option("ThumbnailColor"))
-        tn_generator.generate_all(docs, dest_path, counter)
+        tn_generator.generate_all(docs, dest_path, counter, directory)

        done.value = 1

--- a/indexer.py
+++ b/indexer.py
@ -15,7 +15,6 @@ class Indexer:

        try:
            requests.head("http://localhost:9200")
-            print("elasticsearch is already running")

        except requests.exceptions.ConnectionError:
            import time
--- a/parsing.py
+++ b/parsing.py
@ -9,7 +9,7 @@ import html
 import warnings
 import docx2txt
 import xlrd
-from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfparser import PDFParser, PDFSyntaxError
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -124,8 +124,9 @@ class GenericFileParser(FileParser):
    mime_types = []
    is_default = True

-    def __init__(self, checksum_calculators: list):
+    def __init__(self, checksum_calculators: list, root_dir: str):
        self.checksum_calculators = checksum_calculators
+        self.root_dir = root_dir

    def parse(self, full_path: str) -> dict:
        """
@ -141,7 +142,7 @@ class GenericFileParser(FileParser):
        name, extension = os.path.splitext(name)

        info["size"] = file_stat.st_size
-        info["path"] = path  # todo save relative path
+        info["path"] = os.path.relpath(path, self.root_dir)
        info["name"] = name
        info["extension"] = extension[1:]
        info["mtime"] = file_stat.st_mtime
@ -156,8 +157,8 @@ class MediaFileParser(GenericFileParser):
    is_default = False
    relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"]

-    def __init__(self, checksum_calculators: list):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.mime_types = [
            "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
@ -207,8 +208,8 @@ class MediaFileParser(GenericFileParser):
 class PictureFileParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.mime_types = [
            "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif",
@ -246,8 +247,8 @@ class PictureFileParser(GenericFileParser):
 class TextFileParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+        super().__init__(checksum_calculators, root_dir)
        self.content_length = content_length

        self.mime_types = [
@ -271,7 +272,7 @@ class TextFileParser(GenericFileParser):
            "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
            "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
-            "text/x-makefile", "application/javascript"
+            "text/x-makefile", "application/javascript", "application/rtf"
        ]

    def parse(self, full_path: str):
@ -298,8 +299,8 @@ class TextFileParser(GenericFileParser):
 class FontParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.mime_types = [
            "application/font-sfnt", "application/font-woff", "application/vdn.ms-fontobject",
@ -336,8 +337,8 @@ class FontParser(GenericFileParser):
 class PdfFileParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.content_length = content_length

@ -351,11 +352,14 @@ class PdfFileParser(GenericFileParser):
        if self.content_length > 0:
            with open(full_path, "rb") as f:

+                try:
+                    parser = PDFParser(f)
+                    document = PDFDocument(parser)
+                except PDFSyntaxError:
+                    print("couldn't parse PDF " + full_path)
+                    return info
+
                info["content"] = ""
-
-                parser = PDFParser(f)
-                document = PDFDocument(parser)
-
                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
                    if isinstance(document.info[0]["Title"], bytes):
                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
@ -399,8 +403,8 @@ class PdfFileParser(GenericFileParser):
 class EbookParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.content_length = content_length

@ -435,8 +439,8 @@ class EbookParser(GenericFileParser):
 class DocxParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.content_length = content_length

@ -447,12 +451,16 @@ class DocxParser(GenericFileParser):
    def parse(self, full_path: str):
        info = super().parse(full_path)

-        text = docx2txt.process(full_path)
+        if self.content_length > 0:  # skip extraction when no content is requested
+            try:
+                text = docx2txt.process(full_path)

-        if len(text) < self.content_length:
-            info["content"] = text
-        else:
-            info["content"] = text[0:self.content_length]
+                if len(text) < self.content_length:
+                    info["content"] = text
+                else:
+                    info["content"] = text[0:self.content_length]
+            except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
+                print("Couldn't parse Docx: " + full_path)

        return info

@ -460,8 +468,8 @@ class DocxParser(GenericFileParser):
 class SpreadSheetParser(GenericFileParser):
    is_default = False

-    def __init__(self, checksum_calculators: list, content_length: int):
-        super().__init__(checksum_calculators)
+    def __init__(self, checksum_calculators: list, content_length: int, root_dir):
+        super().__init__(checksum_calculators, root_dir)

        self.content_length = content_length

--- a/run.py
+++ b/run.py
@ -128,7 +128,23 @@ def search_route():
    size_max = request.json["size_max"]
    mime_types = request.json["mime_types"]
    must_match = request.json["must_match"]
-    directories = request.json["directories"]  # todo: make sure dir exists and is enabled
+    directories = request.json["directories"]
+
+    # Remove disabled & non-existing directories.
+    # Iterate over a copy: calling remove() on the list being iterated makes
+    # the loop skip the entry that follows each removed one.
+    for search_directory in list(directories):
+        directory_exists = False
+        for dir_id in storage.dirs():
+            if search_directory == dir_id:
+                directory_exists = True
+                if not storage.dirs()[dir_id].enabled:
+                    directories.remove(search_directory)
+                break
+
+        if not directory_exists:
+            directories.remove(search_directory)
+
    path = request.json["path"]

    page = search.search(query, size_min, size_max, mime_types, must_match, directories, path)
--- a/spec/Crawler_spec.py
+++ b/spec/Crawler_spec.py
@ -8,9 +8,9 @@ class CrawlerTest(TestCase):

    def test_dir_walk(self):

-        c = Crawler([GenericFileParser([Sha1CheckSumCalculator()])])
+        c = Crawler([GenericFileParser([Sha1CheckSumCalculator()], "test_files/")])

-        c.crawl("test_folder")
+        c.crawl("./test_folder")

        self.assertEqual(len(c.documents), 31)

@ -19,3 +19,16 @@ class CrawlerTest(TestCase):
        c = Crawler([])

        self.assertEqual(c.countFiles("test_folder"), 31)
+
+    def test_path(self):
+
+        c = Crawler([GenericFileParser([], "./test_folder")])
+        c.crawl("./test_folder")
+
+        file_count_in_sub2 = 0
+
+        for doc in c.documents:
+            if doc["path"] == "sub2":
+                file_count_in_sub2 += 1
+
+        self.assertEqual(file_count_in_sub2, 2)
--- a/spec/DocxParser_spec.py
+++ b/spec/DocxParser_spec.py
@ -6,7 +6,7 @@ class DocxParserTest(TestCase):

    def test_parse_content(self):

-        parser = DocxParser([], 1000)
+        parser = DocxParser([], 1000, "test_files/")

        info = parser.parse("test_files/docx1.docx")

--- a/spec/EbookParserTest.py
+++ b/spec/EbookParserTest.py
@ -6,7 +6,7 @@ class EbookParserTest(TestCase):

    def test_parse_content(self):

-        parser = EbookParser([], 1000)
+        parser = EbookParser([], 1000, "test_files/")

        info = parser.parse("test_files/epub1.epub")

--- a/spec/FileParser_spec.py
+++ b/spec/FileParser_spec.py
@ -16,34 +16,34 @@ class GenericFileParserTest(TestCase):
        test_file.close()
        os.utime("test_parse.txt", (1330123456, 1330654321))

-        self.parser = GenericFileParser([Md5CheckSumCalculator()])
+        self.parser = GenericFileParser([Md5CheckSumCalculator()], "./test_files/")

    def tearDown(self):
        os.remove("test_parse.txt")

    def test_parse_size(self):
-        result = self.parser.parse("test_parse.txt")
+        result = self.parser.parse("./test_parse.txt")

        self.assertEqual(result["size"], 8)

    def test_parse_name(self):
-        result = self.parser.parse("test_parse.txt")
+        result = self.parser.parse("./test_parse.txt")

        self.assertEqual(result["name"], "test_parse")

    def test_parse_ext(self):
-        result = self.parser.parse("test_parse.txt")
+        result = self.parser.parse("./test_parse.txt")

        self.assertEqual(result["extension"], "txt")

    def test_parse_md5(self):
-        result = self.parser.parse("test_parse.txt")
+        result = self.parser.parse("./test_parse.txt")

        self.assertEqual(result["md5"], "25D55AD283AA400AF464C76D713C07AD")

    def test_mtime(self):

-        result = self.parser.parse("test_parse.txt")
+        result = self.parser.parse("./test_parse.txt")

        self.assertEqual(result["mtime"], 1330654321)

--- a/spec/FontParser_spec.py
+++ b/spec/FontParser_spec.py
@ -6,7 +6,7 @@ class FontParserTest(TestCase):

    def test_parse_name_trueType(self):

-        parser = FontParser([])
+        parser = FontParser([], "test_files/")

        info = parser.parse("test_files/truetype1.ttf")

@ -14,7 +14,7 @@ class FontParserTest(TestCase):

    def test_parse_name_openType(self):

-        parser = FontParser([])
+        parser = FontParser([], "test_files/")

        info = parser.parse("test_files/opentype1.otf")

@ -22,7 +22,7 @@ class FontParserTest(TestCase):

    def test_parse_name_woff(self):

-        parser = FontParser([])
+        parser = FontParser([], "test_files/")

        info = parser.parse("test_files/woff.woff")

@ -30,7 +30,7 @@ class FontParserTest(TestCase):

    def test_parse_name_woff2(self):

-        parser = FontParser([])
+        parser = FontParser([], "test_files/")

        info = parser.parse("test_files/woff2.woff2")

--- a/spec/MediaFileParser_spec.py
+++ b/spec/MediaFileParser_spec.py
@ -6,37 +6,33 @@ class MediaFileParserTest(TestCase):

    def test_audio_wav(self):

-        parser = MediaFileParser([])
+        parser = MediaFileParser([], "test_files/")

-        info = parser.parse("test_files/cat1.wav")
+        info = parser.parse("./test_files/cat1.wav")

-        self.assertEqual(info["format_name"], "wav")
        self.assertEqual(info["format_long_name"], "WAV / WAVE (Waveform Audio)")
        self.assertEqual(info["duration"], 20.173875)

    def test_video_mov(self):
-        parser = MediaFileParser([])
+        parser = MediaFileParser([], "./test_files")

-        info = parser.parse("test_files/vid1.mp4")
+        info = parser.parse("./test_files/vid1.mp4")

-        self.assertEqual(info["format_name"], "mov,mp4,m4a,3gp,3g2,mj2")
        self.assertEqual(info["format_long_name"], "QuickTime / MOV")
        self.assertEqual(info["duration"], 5.334)

    def test_video_webm(self):
-        parser = MediaFileParser([])
+        parser = MediaFileParser([], "test_files/")

        info = parser.parse("test_files/vid2.webm")

-        self.assertEqual(info["format_name"], "matroska,webm")
        self.assertEqual(info["format_long_name"], "Matroska / WebM")
        self.assertEqual(info["duration"], 10.619)

    def test_video_ogg(self):
-        parser = MediaFileParser([])
+        parser = MediaFileParser([], "test_files/")

        info = parser.parse("test_files/vid3.ogv")

-        self.assertEqual(info["format_name"], "ogg")
        self.assertEqual(info["format_long_name"], "Ogg")
        self.assertEqual(info["duration"], 10.618867)
--- a/spec/PdfFileParser.py
+++ b/spec/PdfFileParser.py
@ -6,7 +6,7 @@ class PdfParserTest(TestCase):

    def test_parse_content(self):

-        parser = PdfFileParser([], 12488)
+        parser = PdfFileParser([], 12488, "test_files/")

        info = parser.parse("test_files/pdf1.pdf")

--- a/spec/PictureFileParser_spec.py
+++ b/spec/PictureFileParser_spec.py
@ -6,7 +6,7 @@ class PictureFileParserTest(TestCase):

    def test_parse_jpg(self):

-        parser = PictureFileParser([])
+        parser = PictureFileParser([], "test_files/")

        info = parser.parse("test_folder/sample_1.jpg")

@ -17,7 +17,7 @@ class PictureFileParserTest(TestCase):

    def test_parse_png(self):

-        parser = PictureFileParser([])
+        parser = PictureFileParser([], "test_files/")

        info = parser.parse("test_folder/sample_5.png")

@ -28,7 +28,7 @@ class PictureFileParserTest(TestCase):

    def test_parse_gif(self):

-        parser = PictureFileParser([])
+        parser = PictureFileParser([], "test_files/")

        info = parser.parse("test_folder/sample_6.gif")

@ -39,7 +39,7 @@ class PictureFileParserTest(TestCase):

    def test_parse_bmp(self):

-        parser = PictureFileParser([])
+        parser = PictureFileParser([], "test_files/")

        info = parser.parse("test_folder/sample_7.bmp")

--- a/spec/SpreadSheetParser_spec.py
+++ b/spec/SpreadSheetParser_spec.py
@ -6,7 +6,7 @@ class PdfParserTest(TestCase):

    def test_parse_content_xls(self):

-        parser = SpreadSheetParser([], 1500)
+        parser = SpreadSheetParser([], 1500, "test_files/")

        info = parser.parse("test_files/xls1.xls")

@ -14,7 +14,7 @@ class PdfParserTest(TestCase):

    def test_parse_content_xlsx(self):

-        parser = SpreadSheetParser([], 1500)
+        parser = SpreadSheetParser([], 1500, "test_files/")

        info = parser.parse("test_files/xlsx1.xlsx")

--- a/spec/TextFileParser_spec.py
+++ b/spec/TextFileParser_spec.py
@ -6,7 +6,7 @@ class TextFileParserTest(TestCase):

    def test_parse_csv(self):

-        parser = TextFileParser([], 1234)
+        parser = TextFileParser([], 1234, "test_files/")

        info = parser.parse("test_files/text.csv")

--- a/static/js/search.js
+++ b/static/js/search.js
@ -592,4 +592,4 @@ document.getElementById("pathBar").addEventListener("keyup", function () {
    searchQueued = true;
 });

-window.setInterval(search, 75);
+window.setInterval(search, 150);
--- a/templates/directory.html
+++ b/templates/directory.html
@ -43,7 +43,7 @@
                    {% for dir in directories %}
                        <tr>
                            <td>{{ directories[dir].name }}</td>
-                            <td><pre style="width: 80%">{{ directories[dir].path }}</pre></td>
+                            <td style="word-break: break-all"><pre>{{ directories[dir].path }}</pre></td>
                            <td><i class="far {{ "fa-check-square" if directories[dir].enabled else "fa-square" }}"></i></td>
                            <td>2018-02-21</td>
                            <td><a href="directory/{{ dir }}" class="btn btn-primary"><i class="fas fa-cog"></i> Manage</a> </td>
--- a/templates/directory_manage.html
+++ b/templates/directory_manage.html
@ -140,7 +140,6 @@
        </div>

        <div class="card">
-            {# TODO: put github wiki link #}
            <div class="card-header">Options <a href="#" style="float:right">Learn more <i class="fas fa-external-link-alt"></i></a></div>
            <div class="card-body">
                <table class="info-table table-striped table-hover">
--- a/thumbnail.py
+++ b/thumbnail.py
@ -1,6 +1,6 @@
 from PIL import Image
 import os
-from multiprocessing import Value
+from multiprocessing import Value, Process
 import ffmpeg
 import cairosvg

@ -20,12 +20,21 @@ class ThumbnailGenerator:
        if mime == "image/svg+xml":

            try:
-                cairosvg.svg2png(url=path, write_to="tmp")
-                self.generate_image("tmp", dest_path)
-                os.remove("tmp")
+                p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
+                p.start()
+                p.join(1.5)
+
+                if p.is_alive():
+                    p.terminate()
+                    print("Timed out: " + path)
+                else:
+                    self.generate_image("tmp", dest_path)
            except Exception:
                print("Couldn't make thumbnail for " + path)

+            if os.path.exists("tmp"):
+                os.remove("tmp")
+
        elif mime.startswith("image"):

            try:
@ -41,18 +50,20 @@ class ThumbnailGenerator:
                 .run()
                 )
                self.generate_image("tmp", dest_path)
-                os.remove("tmp")
            except Exception as e:
                print(e)
                print("Couldn't make thumbnail for " + path)

-    def generate_all(self, docs, dest_path,  counter: Value=None):
+            if os.path.exists("tmp"):
+                os.remove("tmp")
+
+    def generate_all(self, docs, dest_path,  counter: Value=None, directory=None):

        os.makedirs(dest_path, exist_ok=True)

        for doc in docs:
            extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"]
-            full_path = os.path.join(doc["_source"]["path"], doc["_source"]["name"] + extension)
+            full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension)

            if os.path.isfile(full_path) and "mime" in doc["_source"]:
                self.generate(full_path, os.path.join(dest_path, doc["_id"]), doc["_source"]["mime"])
@ -61,6 +72,7 @@ class ThumbnailGenerator:
                counter.value += 1

    def generate_image(self, path, dest_path):
+
        with open(path, "rb") as image_file:
            with Image.open(image_file) as image: