Finished path search & autocomplete. Added font parsing

2025-12-18 09:39:02 +00:00 · 2018-04-16 17:25:34 -04:00
parent b454653d51
commit 6d3cceb1b1
15 changed files with 124 additions and 41 deletions
--- a/config.py
+++ b/config.py
@@ -5,7 +5,7 @@ default_options = {
    "TextFileContentLength": "8192",
    "MimeGuesser": "extension",  # extension, content
    "CheckSumCalculators": "",  # md5, sha1, sha256
-    "FileParsers": "media, text, picture"  # media, text, picture
+    "FileParsers": "media, text, picture, font"  # media, text, picture, font
 }

 index_every = 10000
--- a/crawler.py
+++ b/crawler.py
@@ -4,7 +4,7 @@ import json
 from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
-    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser
+    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -12,6 +12,7 @@ from storage import Directory
 import shutil
 import config

+
 class RunningTask:

    def __init__(self, task: Task):
@@ -136,7 +137,8 @@ class TaskManager:
        c = Crawler([GenericFileParser(chksum_calcs),
                     MediaFileParser(chksum_calcs),
                     TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
-                     PictureFileParser(chksum_calcs)],
+                     PictureFileParser(chksum_calcs),
+                     FontParser(chksum_calcs)],
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)

--- a/indexer.py
+++ b/indexer.py
@@ -86,7 +86,8 @@ class Indexer:
            "artist": {"analyzer": "my_nGram", "type": "text"},
            "title": {"analyzer": "my_nGram", "type": "text"},
            "genre": {"analyzer": "my_nGram", "type": "text"},
-            "album_artist": {"analyzer": "my_nGram", "type": "text"}
+            "album_artist": {"analyzer": "my_nGram", "type": "text"},
+            "font_name": {"analyzer": "my_nGram", "type": "text"},
        }}, doc_type="file", index=self.index_name)

        self.es.indices.open(index=self.index_name)
--- a/parsing.py
+++ b/parsing.py
@@ -6,16 +6,17 @@ import subprocess
 import json
 import chardet
 import html
+import warnings
 from PIL import Image
+from fontTools.ttLib import TTFont, TTLibError
+

 class MimeGuesser:
-
    def guess_mime(self, full_path):
        raise NotImplementedError()


 class ContentMimeGuesser(MimeGuesser):
-
    def __init__(self):
        self.libmagic = magic.Magic(mime=True)

@@ -27,7 +28,6 @@ class ContentMimeGuesser(MimeGuesser):


 class ExtensionMimeGuesser(MimeGuesser):
-
    def guess_mime(self, full_path):
        return mimetypes.guess_type(full_path, strict=False)[0]

@@ -41,7 +41,6 @@ class FileParser:


 class FileCheckSumCalculator:
-
    def checksum(self, path: str) -> str:
        """
        Calculate the checksum of a file
@@ -52,7 +51,6 @@ class FileCheckSumCalculator:


 class Md5CheckSumCalculator(FileCheckSumCalculator):
-
    def __init__(self):
        self.name = "md5"

@@ -72,7 +70,6 @@ class Md5CheckSumCalculator(FileCheckSumCalculator):


 class Sha1CheckSumCalculator(FileCheckSumCalculator):
-
    def __init__(self):
        self.name = "sha1"

@@ -92,7 +89,6 @@ class Sha1CheckSumCalculator(FileCheckSumCalculator):


 class Sha256CheckSumCalculator(FileCheckSumCalculator):
-
    def __init__(self):
        self.name = "sha256"

@@ -112,12 +108,10 @@ class Sha256CheckSumCalculator(FileCheckSumCalculator):


 class GenericFileParser(FileParser):
-
    mime_types = []
    is_default = True

    def __init__(self, checksum_calculators: list):
-
        self.checksum_calculators = checksum_calculators

    def parse(self, full_path: str) -> dict:
@@ -146,7 +140,6 @@ class GenericFileParser(FileParser):


 class MediaFileParser(GenericFileParser):
-
    is_default = False
    relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"]

@@ -154,7 +147,7 @@ class MediaFileParser(GenericFileParser):
        super().__init__(checksum_calculators)

        self.mime_types = [
-            "video/3gpp",  "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
+            "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
            "video/webm", "video/x-flv", "video/x-mng", "video/x-ms-asf",
            "video/x-ms-wmv", "video/x-msvideo", "audio/basic", "auido/L24",
            "audio/mid", "audio/mpeg", "audio/mp4", "audio/x-aiff",
@@ -208,15 +201,14 @@ class MediaFileParser(GenericFileParser):


 class PictureFileParser(GenericFileParser):
-
    is_default = False

    def __init__(self, checksum_calculators: list):
        super().__init__(checksum_calculators)

        self.mime_types = [
-            "image/bmp", "image/cgm",  "image/cis-cod", "image/g3fax", "image/gif",
-            "image/ief", "image/jpeg", "image/ktx", "image/pipeg",  "image/pjpeg",
+            "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif",
+            "image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg",
            "image/png", "image/prs.btif", "image/svg+xml", "image/tiff",
            "image/vnd.adobe.photoshop", "image/vnd.dece.graphic", "image/vnd.djvu",
            "image/vnd.dvb.subtitle", "image/vnd.dwg", "image/vnd.dxf",
@@ -234,11 +226,9 @@ class PictureFileParser(GenericFileParser):

        info = super().parse(full_path)

-
        try:
            with open(full_path, "rb") as image_file:
                with Image.open(image_file) as image:
-
                    info["mode"] = image.mode
                    info["format"] = image.format
                    info["width"] = image.width
@@ -250,7 +240,6 @@ class PictureFileParser(GenericFileParser):


 class TextFileParser(GenericFileParser):
-
    is_default = False

    def __init__(self, checksum_calculators: list, content_lenght: int):
@@ -259,14 +248,14 @@ class TextFileParser(GenericFileParser):

        self.mime_types = [
            "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
-            "text/mcf",  "text/pascal", "text/plain",  "text/richtext", "text/scriplet",
-            "text/sgml",  "text/tab-separated-values", "text/uri-list", "text/vnd.abc",
+            "text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet",
+            "text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc",
            "text/vnd.fmi.flexstor", "text/vnd.rn-realtext", "text/vnd.wap.wml",
            "text/vnd.wap.wmlscript", "text/webviewhtml", "text/x-asm", "text/x-audiosoft-intra",
            "text/x-c", "text/x-component", "text/x-fortran", "text/x-h", "text/x-java-source",
-            "text/x-la-asf",  "text/x-m", "text/x-pascal", "text/x-script",
+            "text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script",
            "text/x-script.csh", "text/x-script.elisp", "text/x-script.guile",
-            "text/x-script.ksh", "text/x-script.lisp",  "text/x-script.perl",
+            "text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl",
            "text/x-script.perl-module", "text/x-script.phyton", "text/x-script.rexx",
            "text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl",
            "text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html",
@@ -282,10 +271,10 @@ class TextFileParser(GenericFileParser):

        self.encodings = [
            'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
-            'cp500', 'cp720',  'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
-            'cp856', 'cp857',  'cp858', 'cp860', 'cp861',  'cp862', 'cp863',
-            'cp864', 'cp865',  'cp866', 'cp869', 'cp874', 'cp875',  'cp932',
-            'cp949', 'cp950',  'cp1006', 'cp1026', 'cp1125', 'cp1140',
+            'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
+            'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
+            'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
+            'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
            'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
            'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
@@ -301,21 +290,57 @@ class TextFileParser(GenericFileParser):
        ]

    def parse(self, full_path: str):
-
        info = super().parse(full_path)

        with open(full_path, "rb") as text_file:
-
            raw_content = text_file.read(self.content_lenght)

            chardet.detect(raw_content)
            encoding = chardet.detect(raw_content)["encoding"]

            if encoding is not None and encoding in self.encodings:
-
                info["encoding"] = encoding
                content = raw_content.decode(encoding, "ignore")

                info["content"] = html.escape(content)

        return info
+
+
+class FontParser(GenericFileParser):
+    """
+    File parser that extracts the font name from font files
+
+    On top of the info returned by GenericFileParser.parse(), adds a
+    "font_name" entry when the font has a 'name' table containing a
+    record with nameID 4 (the full font name in the OpenType 'name' table).
+    """
+    is_default = False
+
+    def __init__(self, checksum_calculators: list):
+        """
+        :param checksum_calculators: checksum calculators, forwarded to GenericFileParser
+        """
+        super().__init__(checksum_calculators)
+
+        self.mime_types = [
+            "application/font-sfnt", "application/font-woff", "application/vnd.ms-fontobject",
+            "application/x-font-ttf"
+        ]
+
+    def parse(self, full_path: str):
+        """
+        Parse a font file
+        :param full_path: path of the font file
+        :return: info dict from GenericFileParser.parse(), with "font_name" added
+                 when it could be read. Unreadable fonts are reported on stdout
+                 and the info gathered so far is still returned
+        """
+
+        info = super().parse(full_path)
+
+        with open(full_path, "rb") as f:
+
+            # Silence every warning emitted while the font is being read
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+
+                try:
+
+                    font = TTFont(f)
+
+                    if "name" in font:
+                        try:
+                            for name in font["name"].names:
+                                # nameID 4 is the full font name; keep the first record found
+                                if name.nameID == 4:
+                                    info["font_name"] = name.toUnicode("replace")
+                                    break
+                        except AssertionError:
+                            print("Could not read font name for " + full_path)
+                except TTLibError:
+                    print("Could not read font for " + full_path)
+
+        return info
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,10 @@ flask_bcrypt
 elasticsearch
 python-magic
 requests
-apscheduler
+apscheduler
+humanfriendly
+chardet
+fonttools
+brotli
+unicodedata2
+slate
--- a/run.py
+++ b/run.py
@@ -100,6 +100,7 @@ def thumb(doc_id):
 def search_page():

    mime_map = search.get_mime_map()
+    mime_map.append({"id": "any", "text": "Any"})

    return render_template("search.html", directories=storage.dirs(), mime_map=mime_map)

--- a/search.py
+++ b/search.py
@@ -97,13 +97,15 @@ class Search:

        filters = [
            {"range": {"size": {"gte": size_min, "lte": size_max}}},
-            {"terms": {"mime": mime_types}},
            {"terms": {"directory": directories}}
        ]

        if path != "":
            filters.append({"term": {"path": path}})

+        if mime_types != "any":
+            filters.append({"terms": {"mime": mime_types}})
+
        page = self.es.search(body={
            "query": {
                "bool": {
@@ -111,7 +113,7 @@ class Search:
                        "multi_match": {
                            "query": query,
                            "fields": ["name", "content", "album", "artist", "title", "genre",
-                                       "album_artist"],
+                                       "album_artist", "font_name"],
                            "operator": "and"
                        }
                    },
@@ -125,6 +127,7 @@ class Search:
                "fields": {
                    "content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
                    "name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
+                    "font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
                }
            },
            "aggs": {
--- a/spec/FontParser_spec.py
+++ b/spec/FontParser_spec.py
@@ -0,0 +1,37 @@
+from unittest import TestCase
+from parsing import FontParser
+
+
+class FontParserTest(TestCase):
+    """Checks the "font_name" that FontParser reads from each sample font file."""
+
+    def _assert_font_name(self, font_path, expected_name):
+        """Parse font_path with a FontParser built without checksum calculators and compare its "font_name"."""
+        parsed = FontParser([]).parse(font_path)
+        self.assertEqual(parsed["font_name"], expected_name)
+
+    def test_parse_name_trueType(self):
+        self._assert_font_name("test_files/truetype1.ttf", "Liberation Mono Bold")
+
+    def test_parse_name_openType(self):
+        self._assert_font_name("test_files/opentype1.otf", "Linux Biolinum Keyboard O")
+
+    def test_parse_name_woff(self):
+        self._assert_font_name("test_files/woff.woff", "Heart of Gold")
+
+    def test_parse_name_woff2(self):
+        self._assert_font_name("test_files/woff2.woff2", "Heart of Gold")
--- a/spec/test_files/opentype1.otf
+++ b/spec/test_files/opentype1.otf
--- a/spec/test_files/truetype1.ttf
+++ b/spec/test_files/truetype1.ttf
--- a/spec/test_files/woff.woff
+++ b/spec/test_files/woff.woff
--- a/spec/test_files/woff2.woff2
+++ b/spec/test_files/woff2.woff2
--- a/templates/directory_manage.html
+++ b/templates/directory_manage.html
@@ -9,19 +9,19 @@

        function swapToForm(elem, fields, formAction, inputName) {

-            var form = document.createElement("form");
+            let form = document.createElement("form");
            form.setAttribute("action", formAction);

-            for (var i in fields) {
+            for (let i in fields) {

-                var hiddenInput = document.createElement("input");
+                let hiddenInput = document.createElement("input");
                hiddenInput.setAttribute("type", "hidden");
                hiddenInput.setAttribute("value", fields[i].value);
                hiddenInput.setAttribute("name", fields[i].name);
                form.appendChild(hiddenInput);
            }

-            var input = document.createElement("input");
+            let input = document.createElement("input");
            input.setAttribute("class", "form-control");
            input.setAttribute("type", "text");
            input.setAttribute("name", inputName);
--- a/templates/search.html
+++ b/templates/search.html
@@ -563,9 +563,17 @@
                        docCard.appendChild(contentDiv);
                    }

+                    //Font_name
+                    if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
+                        let contentDiv = document.createElement("div");
+                        contentDiv.setAttribute("class", "content-div bg-light");
+                        contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
+                        docCard.appendChild(contentDiv);
+                    }
+
                    //Audio
                    if (mimeCategory === "audio") {
-
+                        //TODO
                    }

                    if (thumbnail !== null) {
--- a/thumbnail.py
+++ b/thumbnail.py
@@ -28,7 +28,7 @@ class ThumbnailGenerator:
            try:
                (ffmpeg.
                 input(path)
-                 .output("tmp", vframes=1, f="image2", loglevel="error")
+                 .overwrite_output("tmp", vframes=1, f="image2", loglevel="error")
                 .run()
                 )
                self.generate_image("tmp", dest_path)