diff --git a/config.py b/config.py index bcf3231..6d87da2 100644 --- a/config.py +++ b/config.py @@ -5,7 +5,7 @@ default_options = { "TextFileContentLength": "8192", "MimeGuesser": "extension", # extension, content "CheckSumCalculators": "", # md5, sha1, sha256 - "FileParsers": "media, text, picture" # media, text, picture + "FileParsers": "media, text, picture, font" # media, text, picture } index_every = 10000 diff --git a/crawler.py b/crawler.py index 80fe001..7c9c7cd 100644 --- a/crawler.py +++ b/crawler.py @@ -4,7 +4,7 @@ import json from multiprocessing import Process, Value from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ - PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser + PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser from indexer import Indexer from search import Search from thumbnail import ThumbnailGenerator @@ -12,6 +12,7 @@ from storage import Directory import shutil import config + class RunningTask: def __init__(self, task: Task): @@ -136,7 +137,8 @@ class TaskManager: c = Crawler([GenericFileParser(chksum_calcs), MediaFileParser(chksum_calcs), TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), - PictureFileParser(chksum_calcs)], + PictureFileParser(chksum_calcs), + FontParser(chksum_calcs)], mime_guesser, self.indexer, directory.id) c.crawl(directory.path, counter) diff --git a/indexer.py b/indexer.py index 7e36e43..364d2ed 100644 --- a/indexer.py +++ b/indexer.py @@ -86,7 +86,8 @@ class Indexer: "artist": {"analyzer": "my_nGram", "type": "text"}, "title": {"analyzer": "my_nGram", "type": "text"}, "genre": {"analyzer": "my_nGram", "type": "text"}, - "album_artist": {"analyzer": "my_nGram", "type": "text"} + "album_artist": {"analyzer": "my_nGram", "type": "text"}, + "font_name": {"analyzer": "my_nGram", "type": "text"}, }}, doc_type="file", index=self.index_name) self.es.indices.open(index=self.index_name) diff --git a/parsing.py b/parsing.py index 27aa930..050f553 100644 --- a/parsing.py +++ b/parsing.py @@ -6,16 +6,17 @@ import subprocess import json import chardet import html +import warnings from PIL import Image +from fontTools.ttLib import TTFont, TTLibError + class MimeGuesser: - def guess_mime(self, full_path): raise NotImplementedError() class ContentMimeGuesser(MimeGuesser): - def __init__(self): self.libmagic = magic.Magic(mime=True) @@ -27,7 +28,6 @@ class ContentMimeGuesser(MimeGuesser): class ExtensionMimeGuesser(MimeGuesser): - def guess_mime(self, full_path): return mimetypes.guess_type(full_path, strict=False)[0] @@ -41,7 +41,6 @@ class FileParser: class FileCheckSumCalculator: - def checksum(self, path: str) -> str: """ Calculate the checksum of a file @@ -52,7 +51,6 @@ class FileCheckSumCalculator: class Md5CheckSumCalculator(FileCheckSumCalculator): - def __init__(self): self.name = "md5" @@ -72,7 +70,6 @@ class Md5CheckSumCalculator(FileCheckSumCalculator): class Sha1CheckSumCalculator(FileCheckSumCalculator): - def __init__(self): self.name = "sha1" @@ -92,7 +89,6 @@ class Sha1CheckSumCalculator(FileCheckSumCalculator): class Sha256CheckSumCalculator(FileCheckSumCalculator): - def __init__(self): self.name = "sha256" @@ -112,12 +108,10 @@ class Sha256CheckSumCalculator(FileCheckSumCalculator): class GenericFileParser(FileParser): - mime_types = [] is_default = True def __init__(self, checksum_calculators: list): - self.checksum_calculators = checksum_calculators def parse(self, full_path: str) -> dict: @@ -146,7 +140,6 @@ class GenericFileParser(FileParser): class MediaFileParser(GenericFileParser): - is_default = False relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"] @@ -154,7 +147,7 @@ class MediaFileParser(GenericFileParser): super().__init__(checksum_calculators) self.mime_types = [ - "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime", + "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime", "video/webm", "video/x-flv", "video/x-mng", "video/x-ms-asf", "video/x-ms-wmv", "video/x-msvideo", "audio/basic", "auido/L24", "audio/mid", "audio/mpeg", "audio/mp4", "audio/x-aiff", @@ -208,15 +201,14 @@ class MediaFileParser(GenericFileParser): class PictureFileParser(GenericFileParser): - is_default = False def __init__(self, checksum_calculators: list): super().__init__(checksum_calculators) self.mime_types = [ - "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif", - "image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg", + "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif", + "image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg", "image/png", "image/prs.btif", "image/svg+xml", "image/tiff", "image/vnd.adobe.photoshop", "image/vnd.dece.graphic", "image/vnd.djvu", "image/vnd.dvb.subtitle", "image/vnd.dwg", "image/vnd.dxf", @@ -234,11 +226,9 @@ class PictureFileParser(GenericFileParser): info = super().parse(full_path) - try: with open(full_path, "rb") as image_file: with Image.open(image_file) as image: - info["mode"] = image.mode info["format"] = image.format info["width"] = image.width @@ -250,7 +240,6 @@ class PictureFileParser(GenericFileParser): class TextFileParser(GenericFileParser): - is_default = False def __init__(self, checksum_calculators: list, content_lenght: int): @@ -259,14 +248,14 @@ class TextFileParser(GenericFileParser): self.mime_types = [ "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript", - "text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet", - "text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc", + "text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet", + "text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc", "text/vnd.fmi.flexstor", "text/vnd.rn-realtext", "text/vnd.wap.wml", "text/vnd.wap.wmlscript", "text/webviewhtml", "text/x-asm", "text/x-audiosoft-intra", "text/x-c", "text/x-component", "text/x-fortran", "text/x-h", "text/x-java-source", - "text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script", + "text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script", "text/x-script.csh", "text/x-script.elisp", "text/x-script.guile", - "text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl", + "text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl", "text/x-script.perl-module", "text/x-script.phyton", "text/x-script.rexx", "text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl", "text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html", @@ -282,10 +271,10 @@ class TextFileParser(GenericFileParser): self.encodings = [ 'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437', - 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', - 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', - 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', - 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140', + 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', + 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', + 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', + 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', @@ -301,21 +290,57 @@ class TextFileParser(GenericFileParser): ] def parse(self, full_path: str): - info = super().parse(full_path) with open(full_path, "rb") as text_file: - raw_content = text_file.read(self.content_lenght) chardet.detect(raw_content) encoding = chardet.detect(raw_content)["encoding"] if encoding is not None and encoding in self.encodings: - info["encoding"] = encoding content = raw_content.decode(encoding, "ignore") info["content"] = html.escape(content) return info + + +class FontParser(GenericFileParser): + is_default = False + + def __init__(self, checksum_calculators: list): + super().__init__(checksum_calculators) + + self.mime_types = [ + "application/font-sfnt", "application/font-woff", "application/vdn.ms-fontobject", + "application/x-font-ttf" + ] + + def parse(self, full_path: str): + + info = super().parse(full_path) + print(info) + + with open(full_path, "rb") as f: + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + try: + + font = TTFont(f) + + if "name" in font: + try: + for name in font["name"].names: + if name.nameID == 4: + info["font_name"] = name.toUnicode("replace") + break + except AssertionError: + print("Could not read font name for " + full_path) + except TTLibError: + print("Could not read font for " + full_path) + + return info diff --git a/requirements.txt b/requirements.txt index 3d0e91d..0799d8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,10 @@ flask_bcrypt elasticsearch python-magic requests -apscheduler \ No newline at end of file +apscheduler +humanfriendly +chardet +fonttools +brotli +unicodedata2 +slate \ No newline at end of file diff --git a/run.py b/run.py index 76afa9d..213678b 100644 --- a/run.py +++ b/run.py @@ -100,6 +100,7 @@ def thumb(doc_id): def search_page(): mime_map = search.get_mime_map() + mime_map.append({"id": "any", "text": "Any"}) return render_template("search.html", directories=storage.dirs(), mime_map=mime_map) diff --git a/search.py b/search.py index 5803e36..9200407 100644 --- a/search.py +++ b/search.py @@ -97,13 +97,15 @@ class Search: filters = [ {"range": {"size": {"gte": size_min, "lte": size_max}}}, - {"terms": {"mime": mime_types}}, {"terms": {"directory": directories}} ] if path != "": filters.append({"term": {"path": path}}) + if mime_types != "any": + filters.append({"terms": {"mime": mime_types}}) + page = self.es.search(body={ "query": { "bool": { @@ -111,7 +113,7 @@ class Search: "multi_match": { "query": query, "fields": ["name", "content", "album", "artist", "title", "genre", - "album_artist"], + "album_artist", "font_name"], "operator": "and" } }, @@ -125,6 +127,7 @@ class Search: "fields": { "content": {"pre_tags": [""], "post_tags": [""]}, "name": {"pre_tags": [""], "post_tags": [""]}, + "font_name": {"pre_tags": [""], "post_tags": [""]}, } }, "aggs": { diff --git a/spec/FontParser_spec.py b/spec/FontParser_spec.py new file mode 100644 index 0000000..3bc0ad7 --- /dev/null +++ b/spec/FontParser_spec.py @@ -0,0 +1,37 @@ +from unittest import TestCase +from parsing import FontParser + + +class FontParserTest(TestCase): + + def test_parse_name_trueType(self): + + parser = FontParser([]) + + info = parser.parse("test_files/truetype1.ttf") + + self.assertEqual(info["font_name"], "Liberation Mono Bold") + + def test_parse_name_openType(self): + + parser = FontParser([]) + + info = parser.parse("test_files/opentype1.otf") + + self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O") + + def test_parse_name_woff(self): + + parser = FontParser([]) + + info = parser.parse("test_files/woff.woff") + + self.assertEqual(info["font_name"], "Heart of Gold") + + def test_parse_name_woff2(self): + + parser = FontParser([]) + + info = parser.parse("test_files/woff2.woff2") + + self.assertEqual(info["font_name"], "Heart of Gold") diff --git a/spec/test_files/opentype1.otf b/spec/test_files/opentype1.otf new file mode 100644 index 0000000..7bfb993 Binary files /dev/null and b/spec/test_files/opentype1.otf differ diff --git a/spec/test_files/truetype1.ttf b/spec/test_files/truetype1.ttf new file mode 100644 index 0000000..b1f2eb2 Binary files /dev/null and b/spec/test_files/truetype1.ttf differ diff --git a/spec/test_files/woff.woff b/spec/test_files/woff.woff new file mode 100644 index 0000000..922bf9e Binary files /dev/null and b/spec/test_files/woff.woff differ diff --git a/spec/test_files/woff2.woff2 b/spec/test_files/woff2.woff2 new file mode 100644 index 0000000..e530f58 Binary files /dev/null and b/spec/test_files/woff2.woff2 differ diff --git a/templates/directory_manage.html b/templates/directory_manage.html index 1409767..ea10465 100644 --- a/templates/directory_manage.html +++ b/templates/directory_manage.html @@ -9,19 +9,19 @@ function swapToForm(elem, fields, formAction, inputName) { - var form = document.createElement("form"); + let form = document.createElement("form"); form.setAttribute("action", formAction); - for (var i in fields) { + for (let i in fields) { - var hiddenInput = document.createElement("input"); + let hiddenInput = document.createElement("input"); hiddenInput.setAttribute("type", "hidden"); hiddenInput.setAttribute("value", fields[i].value); hiddenInput.setAttribute("name", fields[i].name); form.appendChild(hiddenInput); } - var input = document.createElement("input"); + let input = document.createElement("input"); input.setAttribute("class", "form-control"); input.setAttribute("type", "text"); input.setAttribute("name", inputName); diff --git a/templates/search.html b/templates/search.html index ecdb876..3eb39ef 100644 --- a/templates/search.html +++ b/templates/search.html @@ -563,9 +563,17 @@ docCard.appendChild(contentDiv); } + //Font_name + if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) { + let contentDiv = document.createElement("div"); + contentDiv.setAttribute("class", "content-div bg-light"); + contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]); + docCard.appendChild(contentDiv); + } + //Audio if (mimeCategory === "audio") { - + //TODO } if (thumbnail !== null) { diff --git a/thumbnail.py b/thumbnail.py index 77d71d7..ddb01c5 100644 --- a/thumbnail.py +++ b/thumbnail.py @@ -28,7 +28,7 @@ class ThumbnailGenerator: try: (ffmpeg. input(path) - .output("tmp", vframes=1, f="image2", loglevel="error") + .overwrite_output("tmp", vframes=1, f="image2", loglevel="error") .run() ) self.generate_image("tmp", dest_path)