Finished path search & autocomplete. Added font parsing

This commit is contained in:
simon987 2018-04-16 17:25:34 -04:00
parent b454653d51
commit 6d3cceb1b1
15 changed files with 124 additions and 41 deletions

View File

@ -5,7 +5,7 @@ default_options = {
"TextFileContentLength": "8192", "TextFileContentLength": "8192",
"MimeGuesser": "extension", # extension, content "MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256 "CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture" # media, text, picture "FileParsers": "media, text, picture, font" # media, text, picture
} }
index_every = 10000 index_every = 10000

View File

@ -4,7 +4,7 @@ import json
from multiprocessing import Process, Value from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
from indexer import Indexer from indexer import Indexer
from search import Search from search import Search
from thumbnail import ThumbnailGenerator from thumbnail import ThumbnailGenerator
@ -12,6 +12,7 @@ from storage import Directory
import shutil import shutil
import config import config
class RunningTask: class RunningTask:
def __init__(self, task: Task): def __init__(self, task: Task):
@ -136,7 +137,8 @@ class TaskManager:
c = Crawler([GenericFileParser(chksum_calcs), c = Crawler([GenericFileParser(chksum_calcs),
MediaFileParser(chksum_calcs), MediaFileParser(chksum_calcs),
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
PictureFileParser(chksum_calcs)], PictureFileParser(chksum_calcs),
FontParser(chksum_calcs)],
mime_guesser, self.indexer, directory.id) mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter) c.crawl(directory.path, counter)

View File

@ -86,7 +86,8 @@ class Indexer:
"artist": {"analyzer": "my_nGram", "type": "text"}, "artist": {"analyzer": "my_nGram", "type": "text"},
"title": {"analyzer": "my_nGram", "type": "text"}, "title": {"analyzer": "my_nGram", "type": "text"},
"genre": {"analyzer": "my_nGram", "type": "text"}, "genre": {"analyzer": "my_nGram", "type": "text"},
"album_artist": {"analyzer": "my_nGram", "type": "text"} "album_artist": {"analyzer": "my_nGram", "type": "text"},
"font_name": {"analyzer": "my_nGram", "type": "text"},
}}, doc_type="file", index=self.index_name) }}, doc_type="file", index=self.index_name)
self.es.indices.open(index=self.index_name) self.es.indices.open(index=self.index_name)

View File

@ -6,16 +6,17 @@ import subprocess
import json import json
import chardet import chardet
import html import html
import warnings
from PIL import Image from PIL import Image
from fontTools.ttLib import TTFont, TTLibError
class MimeGuesser: class MimeGuesser:
def guess_mime(self, full_path): def guess_mime(self, full_path):
raise NotImplementedError() raise NotImplementedError()
class ContentMimeGuesser(MimeGuesser): class ContentMimeGuesser(MimeGuesser):
def __init__(self): def __init__(self):
self.libmagic = magic.Magic(mime=True) self.libmagic = magic.Magic(mime=True)
@ -27,7 +28,6 @@ class ContentMimeGuesser(MimeGuesser):
class ExtensionMimeGuesser(MimeGuesser): class ExtensionMimeGuesser(MimeGuesser):
def guess_mime(self, full_path): def guess_mime(self, full_path):
return mimetypes.guess_type(full_path, strict=False)[0] return mimetypes.guess_type(full_path, strict=False)[0]
@ -41,7 +41,6 @@ class FileParser:
class FileCheckSumCalculator: class FileCheckSumCalculator:
def checksum(self, path: str) -> str: def checksum(self, path: str) -> str:
""" """
Calculate the checksum of a file Calculate the checksum of a file
@ -52,7 +51,6 @@ class FileCheckSumCalculator:
class Md5CheckSumCalculator(FileCheckSumCalculator): class Md5CheckSumCalculator(FileCheckSumCalculator):
def __init__(self): def __init__(self):
self.name = "md5" self.name = "md5"
@ -72,7 +70,6 @@ class Md5CheckSumCalculator(FileCheckSumCalculator):
class Sha1CheckSumCalculator(FileCheckSumCalculator): class Sha1CheckSumCalculator(FileCheckSumCalculator):
def __init__(self): def __init__(self):
self.name = "sha1" self.name = "sha1"
@ -92,7 +89,6 @@ class Sha1CheckSumCalculator(FileCheckSumCalculator):
class Sha256CheckSumCalculator(FileCheckSumCalculator): class Sha256CheckSumCalculator(FileCheckSumCalculator):
def __init__(self): def __init__(self):
self.name = "sha256" self.name = "sha256"
@ -112,12 +108,10 @@ class Sha256CheckSumCalculator(FileCheckSumCalculator):
class GenericFileParser(FileParser): class GenericFileParser(FileParser):
mime_types = [] mime_types = []
is_default = True is_default = True
def __init__(self, checksum_calculators: list): def __init__(self, checksum_calculators: list):
self.checksum_calculators = checksum_calculators self.checksum_calculators = checksum_calculators
def parse(self, full_path: str) -> dict: def parse(self, full_path: str) -> dict:
@ -146,7 +140,6 @@ class GenericFileParser(FileParser):
class MediaFileParser(GenericFileParser): class MediaFileParser(GenericFileParser):
is_default = False is_default = False
relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"] relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"]
@ -154,7 +147,7 @@ class MediaFileParser(GenericFileParser):
super().__init__(checksum_calculators) super().__init__(checksum_calculators)
self.mime_types = [ self.mime_types = [
"video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime", "video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
"video/webm", "video/x-flv", "video/x-mng", "video/x-ms-asf", "video/webm", "video/x-flv", "video/x-mng", "video/x-ms-asf",
"video/x-ms-wmv", "video/x-msvideo", "audio/basic", "auido/L24", "video/x-ms-wmv", "video/x-msvideo", "audio/basic", "auido/L24",
"audio/mid", "audio/mpeg", "audio/mp4", "audio/x-aiff", "audio/mid", "audio/mpeg", "audio/mp4", "audio/x-aiff",
@ -208,15 +201,14 @@ class MediaFileParser(GenericFileParser):
class PictureFileParser(GenericFileParser): class PictureFileParser(GenericFileParser):
is_default = False is_default = False
def __init__(self, checksum_calculators: list): def __init__(self, checksum_calculators: list):
super().__init__(checksum_calculators) super().__init__(checksum_calculators)
self.mime_types = [ self.mime_types = [
"image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif", "image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif",
"image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg", "image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg",
"image/png", "image/prs.btif", "image/svg+xml", "image/tiff", "image/png", "image/prs.btif", "image/svg+xml", "image/tiff",
"image/vnd.adobe.photoshop", "image/vnd.dece.graphic", "image/vnd.djvu", "image/vnd.adobe.photoshop", "image/vnd.dece.graphic", "image/vnd.djvu",
"image/vnd.dvb.subtitle", "image/vnd.dwg", "image/vnd.dxf", "image/vnd.dvb.subtitle", "image/vnd.dwg", "image/vnd.dxf",
@ -234,11 +226,9 @@ class PictureFileParser(GenericFileParser):
info = super().parse(full_path) info = super().parse(full_path)
try: try:
with open(full_path, "rb") as image_file: with open(full_path, "rb") as image_file:
with Image.open(image_file) as image: with Image.open(image_file) as image:
info["mode"] = image.mode info["mode"] = image.mode
info["format"] = image.format info["format"] = image.format
info["width"] = image.width info["width"] = image.width
@ -250,7 +240,6 @@ class PictureFileParser(GenericFileParser):
class TextFileParser(GenericFileParser): class TextFileParser(GenericFileParser):
is_default = False is_default = False
def __init__(self, checksum_calculators: list, content_lenght: int): def __init__(self, checksum_calculators: list, content_lenght: int):
@ -259,14 +248,14 @@ class TextFileParser(GenericFileParser):
self.mime_types = [ self.mime_types = [
"text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript", "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
"text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet", "text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet",
"text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc", "text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc",
"text/vnd.fmi.flexstor", "text/vnd.rn-realtext", "text/vnd.wap.wml", "text/vnd.fmi.flexstor", "text/vnd.rn-realtext", "text/vnd.wap.wml",
"text/vnd.wap.wmlscript", "text/webviewhtml", "text/x-asm", "text/x-audiosoft-intra", "text/vnd.wap.wmlscript", "text/webviewhtml", "text/x-asm", "text/x-audiosoft-intra",
"text/x-c", "text/x-component", "text/x-fortran", "text/x-h", "text/x-java-source", "text/x-c", "text/x-component", "text/x-fortran", "text/x-h", "text/x-java-source",
"text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script", "text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script",
"text/x-script.csh", "text/x-script.elisp", "text/x-script.guile", "text/x-script.csh", "text/x-script.elisp", "text/x-script.guile",
"text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl", "text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl",
"text/x-script.perl-module", "text/x-script.phyton", "text/x-script.rexx", "text/x-script.perl-module", "text/x-script.phyton", "text/x-script.rexx",
"text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl", "text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl",
"text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html", "text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html",
@ -282,10 +271,10 @@ class TextFileParser(GenericFileParser):
self.encodings = [ self.encodings = [
'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437', 'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004', 'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
@ -301,21 +290,57 @@ class TextFileParser(GenericFileParser):
] ]
def parse(self, full_path: str): def parse(self, full_path: str):
info = super().parse(full_path) info = super().parse(full_path)
with open(full_path, "rb") as text_file: with open(full_path, "rb") as text_file:
raw_content = text_file.read(self.content_lenght) raw_content = text_file.read(self.content_lenght)
chardet.detect(raw_content) chardet.detect(raw_content)
encoding = chardet.detect(raw_content)["encoding"] encoding = chardet.detect(raw_content)["encoding"]
if encoding is not None and encoding in self.encodings: if encoding is not None and encoding in self.encodings:
info["encoding"] = encoding info["encoding"] = encoding
content = raw_content.decode(encoding, "ignore") content = raw_content.decode(encoding, "ignore")
info["content"] = html.escape(content) info["content"] = html.escape(content)
return info return info
class FontParser(GenericFileParser):
    """File parser that adds the font's full name to the generic file info.

    The name is read from the font's ``name`` table using fontTools and stored
    under the ``font_name`` key of the returned dict.
    """

    is_default = False

    def __init__(self, checksum_calculators: list):
        """Create the parser.

        :param checksum_calculators: checksum calculators forwarded unchanged
            to GenericFileParser
        """
        super().__init__(checksum_calculators)

        self.mime_types = [
            "application/font-sfnt", "application/font-woff", "application/font-woff2",
            # Was misspelled "application/vdn.ms-fontobject", which can never match
            "application/vnd.ms-fontobject",
            "application/x-font-ttf",
            # Types registered under the "font" top-level media type (RFC 8081)
            "font/otf", "font/sfnt", "font/ttf", "font/woff", "font/woff2",
        ]

    def parse(self, full_path: str):
        """Parse a font file and return its info dict.

        :param full_path: path of the font file to read
        :return: the dict produced by GenericFileParser.parse, with an extra
            ``font_name`` entry when a name record with nameID 4 (the full font
            name in the OpenType ``name`` table) could be decoded
        """
        info = super().parse(full_path)

        with open(full_path, "rb") as f:
            with warnings.catch_warnings():
                # Silence any warning emitted while the font is being read
                warnings.simplefilter("ignore")
                try:
                    font = TTFont(f)

                    if "name" in font:
                        for name in font["name"].names:
                            if name.nameID == 4:
                                info["font_name"] = name.toUnicode("replace")
                                break
                except AssertionError:
                    # Best effort: an unreadable name must not abort the crawl
                    print("Could not read font name for " + full_path)
                except TTLibError:
                    print("Could not read font for " + full_path)

        return info

View File

@ -5,3 +5,9 @@ elasticsearch
python-magic python-magic
requests requests
apscheduler apscheduler
humanfriendly
chardet
fonttools
brotli
unicodedata2
slate

1
run.py
View File

@ -100,6 +100,7 @@ def thumb(doc_id):
def search_page(): def search_page():
mime_map = search.get_mime_map() mime_map = search.get_mime_map()
mime_map.append({"id": "any", "text": "Any"})
return render_template("search.html", directories=storage.dirs(), mime_map=mime_map) return render_template("search.html", directories=storage.dirs(), mime_map=mime_map)

View File

@ -97,13 +97,15 @@ class Search:
filters = [ filters = [
{"range": {"size": {"gte": size_min, "lte": size_max}}}, {"range": {"size": {"gte": size_min, "lte": size_max}}},
{"terms": {"mime": mime_types}},
{"terms": {"directory": directories}} {"terms": {"directory": directories}}
] ]
if path != "": if path != "":
filters.append({"term": {"path": path}}) filters.append({"term": {"path": path}})
if mime_types != "any":
filters.append({"terms": {"mime": mime_types}})
page = self.es.search(body={ page = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
@ -111,7 +113,7 @@ class Search:
"multi_match": { "multi_match": {
"query": query, "query": query,
"fields": ["name", "content", "album", "artist", "title", "genre", "fields": ["name", "content", "album", "artist", "title", "genre",
"album_artist"], "album_artist", "font_name"],
"operator": "and" "operator": "and"
} }
}, },
@ -125,6 +127,7 @@ class Search:
"fields": { "fields": {
"content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]}, "content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
"name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]}, "name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
"font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
} }
}, },
"aggs": { "aggs": {

37
spec/FontParser_spec.py Normal file
View File

@ -0,0 +1,37 @@
from unittest import TestCase
from parsing import FontParser
class FontParserTest(TestCase):
    """Checks the font name FontParser extracts from each sample font file."""

    def _parsed_font_name(self, font_path):
        """Parse font_path with a FontParser that has no checksum calculators
        and return the resulting ``font_name`` entry."""
        return FontParser([]).parse(font_path)["font_name"]

    def test_parse_name_trueType(self):
        font_name = self._parsed_font_name("test_files/truetype1.ttf")

        self.assertEqual(font_name, "Liberation Mono Bold")

    def test_parse_name_openType(self):
        font_name = self._parsed_font_name("test_files/opentype1.otf")

        self.assertEqual(font_name, "Linux Biolinum Keyboard O")

    def test_parse_name_woff(self):
        font_name = self._parsed_font_name("test_files/woff.woff")

        self.assertEqual(font_name, "Heart of Gold")

    def test_parse_name_woff2(self):
        font_name = self._parsed_font_name("test_files/woff2.woff2")

        self.assertEqual(font_name, "Heart of Gold")

Binary file not shown.

Binary file not shown.

BIN
spec/test_files/woff.woff Normal file

Binary file not shown.

BIN
spec/test_files/woff2.woff2 Normal file

Binary file not shown.

View File

@ -9,19 +9,19 @@
function swapToForm(elem, fields, formAction, inputName) { function swapToForm(elem, fields, formAction, inputName) {
var form = document.createElement("form"); let form = document.createElement("form");
form.setAttribute("action", formAction); form.setAttribute("action", formAction);
for (var i in fields) { for (let i in fields) {
var hiddenInput = document.createElement("input"); let hiddenInput = document.createElement("input");
hiddenInput.setAttribute("type", "hidden"); hiddenInput.setAttribute("type", "hidden");
hiddenInput.setAttribute("value", fields[i].value); hiddenInput.setAttribute("value", fields[i].value);
hiddenInput.setAttribute("name", fields[i].name); hiddenInput.setAttribute("name", fields[i].name);
form.appendChild(hiddenInput); form.appendChild(hiddenInput);
} }
var input = document.createElement("input"); let input = document.createElement("input");
input.setAttribute("class", "form-control"); input.setAttribute("class", "form-control");
input.setAttribute("type", "text"); input.setAttribute("type", "text");
input.setAttribute("name", inputName); input.setAttribute("name", inputName);

View File

@ -563,9 +563,17 @@
docCard.appendChild(contentDiv); docCard.appendChild(contentDiv);
} }
//Font_name
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
let contentDiv = document.createElement("div");
contentDiv.setAttribute("class", "content-div bg-light");
contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
docCard.appendChild(contentDiv);
}
//Audio //Audio
if (mimeCategory === "audio") { if (mimeCategory === "audio") {
//TODO
} }
if (thumbnail !== null) { if (thumbnail !== null) {

View File

@ -28,7 +28,7 @@ class ThumbnailGenerator:
try: try:
(ffmpeg. (ffmpeg.
input(path) input(path)
.output("tmp", vframes=1, f="image2", loglevel="error") .overwrite_output("tmp", vframes=1, f="image2", loglevel="error")
.run() .run()
) )
self.generate_image("tmp", dest_path) self.generate_image("tmp", dest_path)