mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
Finished path search & autocomplete. Added font parsing
This commit is contained in:
parent
b454653d51
commit
6d3cceb1b1
@ -5,7 +5,7 @@ default_options = {
|
||||
"TextFileContentLength": "8192",
|
||||
"MimeGuesser": "extension", # extension, content
|
||||
"CheckSumCalculators": "", # md5, sha1, sha256
|
||||
"FileParsers": "media, text, picture" # media, text, picture
|
||||
"FileParsers": "media, text, picture, font" # media, text, picture
|
||||
}
|
||||
|
||||
index_every = 10000
|
||||
|
@ -4,7 +4,7 @@ import json
|
||||
from multiprocessing import Process, Value
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
|
||||
from indexer import Indexer
|
||||
from search import Search
|
||||
from thumbnail import ThumbnailGenerator
|
||||
@ -12,6 +12,7 @@ from storage import Directory
|
||||
import shutil
|
||||
import config
|
||||
|
||||
|
||||
class RunningTask:
|
||||
|
||||
def __init__(self, task: Task):
|
||||
@ -136,7 +137,8 @@ class TaskManager:
|
||||
c = Crawler([GenericFileParser(chksum_calcs),
|
||||
MediaFileParser(chksum_calcs),
|
||||
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
|
||||
PictureFileParser(chksum_calcs)],
|
||||
PictureFileParser(chksum_calcs),
|
||||
FontParser(chksum_calcs)],
|
||||
mime_guesser, self.indexer, directory.id)
|
||||
c.crawl(directory.path, counter)
|
||||
|
||||
|
@ -86,7 +86,8 @@ class Indexer:
|
||||
"artist": {"analyzer": "my_nGram", "type": "text"},
|
||||
"title": {"analyzer": "my_nGram", "type": "text"},
|
||||
"genre": {"analyzer": "my_nGram", "type": "text"},
|
||||
"album_artist": {"analyzer": "my_nGram", "type": "text"}
|
||||
"album_artist": {"analyzer": "my_nGram", "type": "text"},
|
||||
"font_name": {"analyzer": "my_nGram", "type": "text"},
|
||||
}}, doc_type="file", index=self.index_name)
|
||||
|
||||
self.es.indices.open(index=self.index_name)
|
||||
|
81
parsing.py
81
parsing.py
@ -6,16 +6,17 @@ import subprocess
|
||||
import json
|
||||
import chardet
|
||||
import html
|
||||
import warnings
|
||||
from PIL import Image
|
||||
from fontTools.ttLib import TTFont, TTLibError
|
||||
|
||||
|
||||
class MimeGuesser:
|
||||
|
||||
def guess_mime(self, full_path):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class ContentMimeGuesser(MimeGuesser):
|
||||
|
||||
def __init__(self):
|
||||
self.libmagic = magic.Magic(mime=True)
|
||||
|
||||
@ -27,7 +28,6 @@ class ContentMimeGuesser(MimeGuesser):
|
||||
|
||||
|
||||
class ExtensionMimeGuesser(MimeGuesser):
|
||||
|
||||
def guess_mime(self, full_path):
|
||||
return mimetypes.guess_type(full_path, strict=False)[0]
|
||||
|
||||
@ -41,7 +41,6 @@ class FileParser:
|
||||
|
||||
|
||||
class FileCheckSumCalculator:
|
||||
|
||||
def checksum(self, path: str) -> str:
|
||||
"""
|
||||
Calculate the checksum of a file
|
||||
@ -52,7 +51,6 @@ class FileCheckSumCalculator:
|
||||
|
||||
|
||||
class Md5CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
def __init__(self):
|
||||
self.name = "md5"
|
||||
|
||||
@ -72,7 +70,6 @@ class Md5CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
|
||||
class Sha1CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
def __init__(self):
|
||||
self.name = "sha1"
|
||||
|
||||
@ -92,7 +89,6 @@ class Sha1CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
|
||||
class Sha256CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
def __init__(self):
|
||||
self.name = "sha256"
|
||||
|
||||
@ -112,12 +108,10 @@ class Sha256CheckSumCalculator(FileCheckSumCalculator):
|
||||
|
||||
|
||||
class GenericFileParser(FileParser):
|
||||
|
||||
mime_types = []
|
||||
is_default = True
|
||||
|
||||
def __init__(self, checksum_calculators: list):
|
||||
|
||||
self.checksum_calculators = checksum_calculators
|
||||
|
||||
def parse(self, full_path: str) -> dict:
|
||||
@ -146,7 +140,6 @@ class GenericFileParser(FileParser):
|
||||
|
||||
|
||||
class MediaFileParser(GenericFileParser):
|
||||
|
||||
is_default = False
|
||||
relevant_properties = ["bit_rate", "nb_streams", "duration", "format_name", "format_long_name"]
|
||||
|
||||
@ -154,7 +147,7 @@ class MediaFileParser(GenericFileParser):
|
||||
super().__init__(checksum_calculators)
|
||||
|
||||
self.mime_types = [
|
||||
"video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
|
||||
"video/3gpp", "video/mp4", "video/mpeg", "video/ogg", "video/quicktime",
|
||||
"video/webm", "video/x-flv", "video/x-mng", "video/x-ms-asf",
|
||||
"video/x-ms-wmv", "video/x-msvideo", "audio/basic", "auido/L24",
|
||||
"audio/mid", "audio/mpeg", "audio/mp4", "audio/x-aiff",
|
||||
@ -208,15 +201,14 @@ class MediaFileParser(GenericFileParser):
|
||||
|
||||
|
||||
class PictureFileParser(GenericFileParser):
|
||||
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list):
|
||||
super().__init__(checksum_calculators)
|
||||
|
||||
self.mime_types = [
|
||||
"image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif",
|
||||
"image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg",
|
||||
"image/bmp", "image/cgm", "image/cis-cod", "image/g3fax", "image/gif",
|
||||
"image/ief", "image/jpeg", "image/ktx", "image/pipeg", "image/pjpeg",
|
||||
"image/png", "image/prs.btif", "image/svg+xml", "image/tiff",
|
||||
"image/vnd.adobe.photoshop", "image/vnd.dece.graphic", "image/vnd.djvu",
|
||||
"image/vnd.dvb.subtitle", "image/vnd.dwg", "image/vnd.dxf",
|
||||
@ -234,11 +226,9 @@ class PictureFileParser(GenericFileParser):
|
||||
|
||||
info = super().parse(full_path)
|
||||
|
||||
|
||||
try:
|
||||
with open(full_path, "rb") as image_file:
|
||||
with Image.open(image_file) as image:
|
||||
|
||||
info["mode"] = image.mode
|
||||
info["format"] = image.format
|
||||
info["width"] = image.width
|
||||
@ -250,7 +240,6 @@ class PictureFileParser(GenericFileParser):
|
||||
|
||||
|
||||
class TextFileParser(GenericFileParser):
|
||||
|
||||
is_default = False
|
||||
|
||||
def __init__(self, checksum_calculators: list, content_lenght: int):
|
||||
@ -259,14 +248,14 @@ class TextFileParser(GenericFileParser):
|
||||
|
||||
self.mime_types = [
|
||||
"text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
|
||||
"text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet",
|
||||
"text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc",
|
||||
"text/mcf", "text/pascal", "text/plain", "text/richtext", "text/scriplet",
|
||||
"text/sgml", "text/tab-separated-values", "text/uri-list", "text/vnd.abc",
|
||||
"text/vnd.fmi.flexstor", "text/vnd.rn-realtext", "text/vnd.wap.wml",
|
||||
"text/vnd.wap.wmlscript", "text/webviewhtml", "text/x-asm", "text/x-audiosoft-intra",
|
||||
"text/x-c", "text/x-component", "text/x-fortran", "text/x-h", "text/x-java-source",
|
||||
"text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script",
|
||||
"text/x-la-asf", "text/x-m", "text/x-pascal", "text/x-script",
|
||||
"text/x-script.csh", "text/x-script.elisp", "text/x-script.guile",
|
||||
"text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl",
|
||||
"text/x-script.ksh", "text/x-script.lisp", "text/x-script.perl",
|
||||
"text/x-script.perl-module", "text/x-script.phyton", "text/x-script.rexx",
|
||||
"text/x-script.scheme", "text/x-script.sh", "text/x-script.tcl",
|
||||
"text/x-script.tcsh", "text/x-script.zsh", "text/x-server-parsed-html",
|
||||
@ -282,10 +271,10 @@ class TextFileParser(GenericFileParser):
|
||||
|
||||
self.encodings = [
|
||||
'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
|
||||
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
|
||||
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
|
||||
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
|
||||
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
|
||||
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
|
||||
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
|
||||
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
|
||||
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
|
||||
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
|
||||
'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
|
||||
'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
|
||||
@ -301,21 +290,57 @@ class TextFileParser(GenericFileParser):
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
    """
    Parse a text file: generic file info plus detected encoding and content

    At most self.content_lenght bytes are read from the start of the file.
    The "encoding" and "content" keys are only added when chardet reports
    an encoding that is listed in self.encodings.

    :param full_path: path of the file to parse
    :return: dict returned by the parent parser, possibly extended with
             "encoding" and the HTML-escaped "content"
    """
    info = super().parse(full_path)

    with open(full_path, "rb") as text_file:
        raw_content = text_file.read(self.content_lenght)

    # Run the detection exactly once; it scans the whole buffer, so a
    # second call whose result is thrown away only doubles the cost.
    encoding = chardet.detect(raw_content)["encoding"]

    if encoding is not None and encoding in self.encodings:
        info["encoding"] = encoding
        # "ignore" drops undecodable bytes, e.g. a multi-byte character
        # cut in half by the read limit.
        content = raw_content.decode(encoding, "ignore")
        info["content"] = html.escape(content)

    return info
|
||||
|
||||
|
||||
class FontParser(GenericFileParser):
    """
    Parser for font files

    Adds a "font_name" key (the font's full name) to the info returned by
    GenericFileParser when the file can be opened with fontTools.
    """
    is_default = False

    def __init__(self, checksum_calculators: list):
        super().__init__(checksum_calculators)

        self.mime_types = [
            "application/font-sfnt", "application/font-woff", "application/vnd.ms-fontobject",
            "application/x-font-ttf"
        ]

    def parse(self, full_path: str):
        """
        Parse a font file

        :param full_path: path of the font file
        :return: dict returned by the parent parser, with "font_name" added
                 when a full-name record could be read from the name table
        """
        info = super().parse(full_path)

        with open(full_path, "rb") as f:
            with warnings.catch_warnings():
                # Warnings raised while the font is being read are silenced;
                # unreadable fonts are reported through the messages below.
                warnings.simplefilter("ignore")

                try:
                    font = TTFont(f)

                    if "name" in font:
                        try:
                            for name in font["name"].names:
                                # nameID 4 is the "full font name" record
                                if name.nameID == 4:
                                    info["font_name"] = name.toUnicode("replace")
                                    break
                        except AssertionError:
                            print("Could not read font name for " + full_path)
                except TTLibError:
                    # Not a font fontTools can read: keep the generic info
                    print("Could not read font for " + full_path)

        return info
|
||||
|
@ -4,4 +4,10 @@ flask_bcrypt
|
||||
elasticsearch
|
||||
python-magic
|
||||
requests
|
||||
apscheduler
|
||||
apscheduler
|
||||
humanfriendly
|
||||
chardet
|
||||
fonttools
|
||||
brotli
|
||||
unicodedata2
|
||||
slate
|
1
run.py
1
run.py
@ -100,6 +100,7 @@ def thumb(doc_id):
|
||||
def search_page():
|
||||
|
||||
mime_map = search.get_mime_map()
|
||||
mime_map.append({"id": "any", "text": "Any"})
|
||||
|
||||
return render_template("search.html", directories=storage.dirs(), mime_map=mime_map)
|
||||
|
||||
|
@ -97,13 +97,15 @@ class Search:
|
||||
|
||||
filters = [
|
||||
{"range": {"size": {"gte": size_min, "lte": size_max}}},
|
||||
{"terms": {"mime": mime_types}},
|
||||
{"terms": {"directory": directories}}
|
||||
]
|
||||
|
||||
if path != "":
|
||||
filters.append({"term": {"path": path}})
|
||||
|
||||
if mime_types != "any":
|
||||
filters.append({"terms": {"mime": mime_types}})
|
||||
|
||||
page = self.es.search(body={
|
||||
"query": {
|
||||
"bool": {
|
||||
@ -111,7 +113,7 @@ class Search:
|
||||
"multi_match": {
|
||||
"query": query,
|
||||
"fields": ["name", "content", "album", "artist", "title", "genre",
|
||||
"album_artist"],
|
||||
"album_artist", "font_name"],
|
||||
"operator": "and"
|
||||
}
|
||||
},
|
||||
@ -125,6 +127,7 @@ class Search:
|
||||
"fields": {
|
||||
"content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
"name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
"font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
}
|
||||
},
|
||||
"aggs": {
|
||||
|
37
spec/FontParser_spec.py
Normal file
37
spec/FontParser_spec.py
Normal file
@ -0,0 +1,37 @@
|
||||
from unittest import TestCase
|
||||
from parsing import FontParser
|
||||
|
||||
|
||||
class FontParserTest(TestCase):
    """Checks the font name FontParser extracts from each sample font file"""

    def _parsed_font_name(self, font_path):
        # Fresh parser without checksum calculators for every sample file
        return FontParser([]).parse(font_path)["font_name"]

    def test_parse_name_trueType(self):
        self.assertEqual(self._parsed_font_name("test_files/truetype1.ttf"), "Liberation Mono Bold")

    def test_parse_name_openType(self):
        self.assertEqual(self._parsed_font_name("test_files/opentype1.otf"), "Linux Biolinum Keyboard O")

    def test_parse_name_woff(self):
        self.assertEqual(self._parsed_font_name("test_files/woff.woff"), "Heart of Gold")

    def test_parse_name_woff2(self):
        self.assertEqual(self._parsed_font_name("test_files/woff2.woff2"), "Heart of Gold")
|
BIN
spec/test_files/opentype1.otf
Normal file
BIN
spec/test_files/opentype1.otf
Normal file
Binary file not shown.
BIN
spec/test_files/truetype1.ttf
Normal file
BIN
spec/test_files/truetype1.ttf
Normal file
Binary file not shown.
BIN
spec/test_files/woff.woff
Normal file
BIN
spec/test_files/woff.woff
Normal file
Binary file not shown.
BIN
spec/test_files/woff2.woff2
Normal file
BIN
spec/test_files/woff2.woff2
Normal file
Binary file not shown.
@ -9,19 +9,19 @@
|
||||
|
||||
function swapToForm(elem, fields, formAction, inputName) {
|
||||
|
||||
var form = document.createElement("form");
|
||||
let form = document.createElement("form");
|
||||
form.setAttribute("action", formAction);
|
||||
|
||||
for (var i in fields) {
|
||||
for (let i in fields) {
|
||||
|
||||
var hiddenInput = document.createElement("input");
|
||||
let hiddenInput = document.createElement("input");
|
||||
hiddenInput.setAttribute("type", "hidden");
|
||||
hiddenInput.setAttribute("value", fields[i].value);
|
||||
hiddenInput.setAttribute("name", fields[i].name);
|
||||
form.appendChild(hiddenInput);
|
||||
}
|
||||
|
||||
var input = document.createElement("input");
|
||||
let input = document.createElement("input");
|
||||
input.setAttribute("class", "form-control");
|
||||
input.setAttribute("type", "text");
|
||||
input.setAttribute("name", inputName);
|
||||
|
@ -563,9 +563,17 @@
|
||||
docCard.appendChild(contentDiv);
|
||||
}
|
||||
|
||||
//Font_name
|
||||
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
|
||||
let contentDiv = document.createElement("div");
|
||||
contentDiv.setAttribute("class", "content-div bg-light");
|
||||
contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
|
||||
docCard.appendChild(contentDiv);
|
||||
}
|
||||
|
||||
//Audio
|
||||
if (mimeCategory === "audio") {
|
||||
|
||||
//TODO
|
||||
}
|
||||
|
||||
if (thumbnail !== null) {
|
||||
|
@ -28,7 +28,7 @@ class ThumbnailGenerator:
|
||||
try:
|
||||
(ffmpeg.
|
||||
input(path)
|
||||
.output("tmp", vframes=1, f="image2", loglevel="error")
|
||||
.overwrite_output("tmp", vframes=1, f="image2", loglevel="error")
|
||||
.run()
|
||||
)
|
||||
self.generate_image("tmp", dest_path)
|
||||
|
Loading…
x
Reference in New Issue
Block a user