Bug fixes, text encoding tag

2025-12-19 01:59:02 +00:00 · 2018-04-14 22:31:55 -04:00
parent 8b55c3b681
commit 157d2c1ea5
10 changed files with 31 additions and 70 deletions
--- a/config.py
+++ b/config.py
@@ -2,9 +2,10 @@ default_options = {
    "ThumbnailQuality": "85",
    "ThumbnailSize": "275",
    "ThumbnailColor": "FF00FF",
-    "TextFileContentLenght": "16384",
+    "TextFileContentLength": "8192",
    "MimeGuesser": "extension",  # extension, content
    "CheckSumCalculators": "",  # md5, sha1, sha256
    "FileParsers": "media, text, picture"  # media, text, picture
 }
 index_every = 50000
--- a/crawler.py
+++ b/crawler.py
@@ -135,7 +135,7 @@ class TaskManager:
        c = Crawler([GenericFileParser(chksum_calcs),
                     MediaFileParser(chksum_calcs),
-                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLenght"))),
+                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
                     PictureFileParser(chksum_calcs)],
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)
--- a/indexer.py
+++ b/indexer.py
@@ -63,11 +63,11 @@ class Indexer:
        self.es.indices.put_settings(body='{"analysis":{"tokenizer":{"path_tokenizer":{"type":"path_hierarchy"}}}}',
                                     index=self.index_name)
-        self.es.indices.put_settings(body='{"analysis":{"tokenizer":{"my_nGram_tokenizer":{"type":"nGram","min_gram":3,"max_gram":4}}}}',
+        self.es.indices.put_settings(body='{"analysis":{"tokenizer":{"my_nGram_tokenizer":{"type":"nGram","min_gram":3,"max_gram":3}}}}',
                                     index=self.index_name)
        self.es.indices.put_settings(body='{"analysis":{"analyzer":{"path_analyser":{"tokenizer":"path_tokenizer"}}}}',
                                     index=self.index_name)
-        self.es.indices.put_settings(body='{"analysis":{"analyzer":{"my_nGram":{"tokenizer":"my_nGram_tokenizer", "filter": ["lowercase"]}}}}',
+        self.es.indices.put_settings(body='{"analysis":{"analyzer":{"my_nGram":{"tokenizer":"my_nGram_tokenizer", "filter": ["lowercase", "asciifolding"]}}}}',
                                     index=self.index_name)
        self.es.indices.put_mapping(body='{"properties": {'
--- a/parsing.py
+++ b/parsing.py
@@ -20,7 +20,10 @@ class ContentMimeGuesser(MimeGuesser):
        self.libmagic = magic.Magic(mime=True)
    def guess_mime(self, full_path):
-        return self.libmagic.from_file(full_path)
+        try:
            return self.libmagic.from_file(full_path)
        except FileNotFoundError:
            return None
 class ExtensionMimeGuesser(MimeGuesser):
@@ -268,7 +271,13 @@ class TextFileParser(GenericFileParser):
            "text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl",
            "text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html",
            "text/x-setext", "text/x-sgml", "text/x-speech", "text/x-uil",
-            "text/x-uuencode", "text/x-vcalendar", "text/xml"
+            "text/x-uuencode", "text/x-vcalendar", "text/xml", "text/x-csrc", "text/csv",
            "text/x-c++src", "text/x-chdr", "text/markdown", "text/x-sh", "text/x-java",
            "text/x-python", "text/x-c++hdr", "text/x-tex", "text/x-diff", "text/x-haskell",
            "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
            "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
            "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
        ]
        self.encodings = [
--- a/spec/ThumbnailGenerator_spec.py
+++ b/spec/ThumbnailGenerator_spec.py
@@ -1,57 +0,0 @@
 from unittest import TestCase
 from thumbnail import ThumbnailGenerator
 from PIL import Image
 import os
 import shutil
 # Tests for ThumbnailGenerator: one single-image case (generate) and one
 # batch case (generate_all). Both read fixtures from "test_folder" and write
 # their output relative to the current working directory.
 class ThumbnailGeneratorTest(TestCase):
    # Generates one thumbnail from a JPEG fixture and checks its pixel size.
    def test_generate(self):
        # presumably 300 is the target thumbnail size in pixels -- TODO confirm
        # against ThumbnailGenerator's constructor
        generator = ThumbnailGenerator(300)
        # Original image is 420x315
        generator.generate("test_folder/sample_1.jpg", "test_thumb1.jpg", "image/JPEG")
        img = Image.open("test_thumb1.jpg")
        width, height = img.size
        img.close()
        # 420x315 scaled down to a width of 300 keeps the aspect ratio:
        # 315 * 300 / 420 == 225.
        self.assertEqual(300, width)
        self.assertEqual(225, height)
        # Cleanup runs only when the assertions above pass; a failing
        # assertion leaves test_thumb1.jpg behind.
        if os.path.isfile("test_thumb1.jpg"):
            os.remove("test_thumb1.jpg")
    # Runs generate_all over a mixed list of documents and checks which ones
    # end up with a non-empty file named after their '_id'.
    def test_generate_all(self):
        # Start from a clean output directory.
        # NOTE(review): shutil.rmtree raises if "test_thumbnails" does not
        # exist (no ignore_errors), so this test fails on a fresh checkout.
        shutil.rmtree("test_thumbnails")
        generator = ThumbnailGenerator(300)
        # One non-image document (books.csv) followed by five image documents.
        # presumably shaped like search-index hits ('_source' + '_id') --
        # verify against the caller of generate_all.
        # NOTE(review): none of these documents carries an 'extension' field;
        # confirm generate_all does not require one.
        docs = [{'_source': {'path': 'test_folder', 'name': 'books.csv'}, '_id': 'books.csv-ID'},
                {'_source': {'path': 'test_folder', 'name': 'sample_3.jpg'}, '_id': 'sample_3.jpg-ID'},
                {'_source': {'path': 'test_folder', 'name': 'sample_5.png'}, '_id': 'sample_5.png-ID'},
                {'_source': {'path': 'test_folder', 'name': 'sample_6.gif'}, '_id': 'sample_6.gif-ID'},
                {'_source': {'path': 'test_folder', 'name': 'sample_7.bmp'}, '_id': 'sample_7.bmp-ID'},
                {'_source': {'path': 'test_folder', 'name': 'sample_2.jpeg'}, '_id': 'sample_2.jpeg-ID'}]
        generator.generate_all(docs, "test_thumbnails")
        # The CSV document must not have a non-empty output file.
        self.assertFalse(os.path.isfile("test_thumbnails/books.csv-ID") and
                         os.path.getsize("test_thumbnails/books.csv-ID") > 0)
        # Every image document must have a non-empty output file named by its '_id'.
        self.assertTrue(os.path.isfile("test_thumbnails/sample_3.jpg-ID") and
                        os.path.getsize("test_thumbnails/sample_3.jpg-ID") > 0)
        self.assertTrue(os.path.isfile("test_thumbnails/sample_2.jpeg-ID") and
                        os.path.getsize("test_thumbnails/sample_2.jpeg-ID") > 0)
        self.assertTrue(os.path.isfile("test_thumbnails/sample_5.png-ID") and
                        os.path.getsize("test_thumbnails/sample_5.png-ID") > 0)
        self.assertTrue(os.path.isfile("test_thumbnails/sample_6.gif-ID") and
                        os.path.getsize("test_thumbnails/sample_6.gif-ID") > 0)
        self.assertTrue(os.path.isfile("test_thumbnails/sample_7.bmp-ID") and
                        os.path.getsize("test_thumbnails/sample_7.bmp-ID") > 0)
--- a/static/css/bootstrap.min.css
+++ b/static/css/bootstrap.min.css
--- a/static/css/bootstrap.min.js
+++ b/static/css/bootstrap.min.js
--- a/static/js/popper.min.js
+++ b/static/js/popper.min.js
--- a/templates/search.html
+++ b/templates/search.html
@@ -39,6 +39,11 @@
            background-color: #FFC107;
        }
        /* Badge colours (white on orange) for the "text" tag, which displays a hit's encoding. */
        .badge-text {
            color: #FFFFFF;
            background-color: #FAAB3C;
        }
        .card-img-top {
@@ -324,7 +329,7 @@
                if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name")) {
                    title.insertAdjacentHTML('afterbegin', hit["highlight"]["name"] + extension);
                } else {
-                    title.appendChild(document.createTextNode(hit["_source"]["name"]));
+                    title.appendChild(document.createTextNode(hit["_source"]["name"] + extension));
                }
                title.setAttribute("title", hit["_source"]["path"] + hit["_source"]["name"] + extension);
@@ -415,6 +420,13 @@
                            formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
                            tags.push(formatTag);
                            break;
                        case "text":
                            formatTag = document.createElement("span");
                            formatTag.setAttribute("class", "badge badge-pill badge-text");
                            formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
                            tags.push(formatTag);
                            break;
                    }
--- a/thumbnail.py
+++ b/thumbnail.py
@@ -41,7 +41,6 @@ class ThumbnailGenerator:
        os.makedirs(dest_path, exist_ok=True)
        for doc in docs:
            extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"]
            full_path = os.path.join(doc["_source"]["path"], doc["_source"]["name"] + extension)