Audio tags in search page and svg thumbnail generation

This commit is contained in:
simon 2018-04-17 11:45:31 -04:00
parent dff7ddc511
commit 4eb9cf6b63
8 changed files with 128 additions and 105 deletions

View File

@ -2,8 +2,10 @@ default_options = {
"ThumbnailQuality": "85", "ThumbnailQuality": "85",
"ThumbnailSize": "275", "ThumbnailSize": "275",
"ThumbnailColor": "FF00FF", "ThumbnailColor": "FF00FF",
"TextFileContentLength": "8192", "TextFileContentLength": "2000",
"PdfFileContentLength": "8192", "PdfFileContentLength": "2000",
"SpreadsheetContentLength": "2000",
"EbookContentLength": "2000",
"MimeGuesser": "extension", # extension, content "MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256 "CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font" # media, text, picture "FileParsers": "media, text, picture, font" # media, text, picture

View File

@ -5,7 +5,7 @@ from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser PdfFileParser, DocxParser, EbookParser
from indexer import Indexer from indexer import Indexer
from search import Search from search import Search
from thumbnail import ThumbnailGenerator from thumbnail import ThumbnailGenerator
@ -77,7 +77,7 @@ class Crawler:
except FileNotFoundError: except FileNotFoundError:
continue # File was deleted continue # File was deleted
if self.indexer is not None: if self.indexer is not None and len(self.documents) > 0:
self.indexer.index(self.documents, self.dir_id) self.indexer.index(self.documents, self.dir_id)
def countFiles(self, root_dir: str): def countFiles(self, root_dir: str):
@ -141,7 +141,8 @@ class TaskManager:
PictureFileParser(chksum_calcs), PictureFileParser(chksum_calcs),
FontParser(chksum_calcs), FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id) mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter) c.crawl(directory.path, counter)

View File

@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
"text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar", "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
"text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch", "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
"text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4", "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
"text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po" "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
] "text/x-makefile"
self.encodings = [
'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
] ]
def parse(self, full_path: str): def parse(self, full_path: str):
info = super().parse(full_path) info = super().parse(full_path)
with open(full_path, "rb") as text_file: if self.content_length > 0:
raw_content = text_file.read(self.content_length) with open(full_path, "rb") as text_file:
raw_content = text_file.read(self.content_length)
chardet.detect(raw_content) chardet.detect(raw_content)
encoding = chardet.detect(raw_content)["encoding"] encoding = chardet.detect(raw_content)["encoding"]
if encoding is not None and encoding in self.encodings: if encoding is not None:
info["encoding"] = encoding info["encoding"] = encoding
content = raw_content.decode(encoding, "ignore") try:
content = raw_content.decode(encoding, "ignore")
info["content"] = html.escape(content) info["content"] = html.escape(content)
except Exception:
print("Unknown encoding: " + encoding)
return info return info
@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
def parse(self, full_path: str): def parse(self, full_path: str):
info = super().parse(full_path) info = super().parse(full_path)
with open(full_path, "rb") as f: if self.content_length > 0:
with open(full_path, "rb") as f:
info["content"] = "" info["content"] = ""
parser = PDFParser(f) parser = PDFParser(f)
document = PDFDocument(parser) document = PDFDocument(parser)
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
if isinstance(document.info[0]["Title"], bytes): if isinstance(document.info[0]["Title"], bytes):
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
else: else:
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n" info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
try: try:
if document.is_extractable: if document.is_extractable:
resource_manager = PDFResourceManager() resource_manager = PDFResourceManager()
la_params = LAParams() la_params = LAParams()
device = PDFPageAggregator(resource_manager, laparams=la_params) device = PDFPageAggregator(resource_manager, laparams=la_params)
interpreter = PDFPageInterpreter(resource_manager, device) interpreter = PDFPageInterpreter(resource_manager, device)
for page in PDFPage.create_pages(document): for page in PDFPage.create_pages(document):
interpreter.process_page(page) interpreter.process_page(page)
layout = device.get_result() layout = device.get_result()
for lt_obj in layout: for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
text = lt_obj.get_text() text = lt_obj.get_text()
if len(info["content"]) + len(text) <= self.content_length: if len(info["content"]) + len(text) <= self.content_length:
info["content"] += text info["content"] += text
else: else:
info["content"] += text[0:self.content_length - len(info["content"])] info["content"] += text[0:self.content_length - len(info["content"])]
break break
else: else:
continue continue
break break
else: else:
print("PDF is not extractable: " + full_path) print("PDF is not extractable: " + full_path)
except ValueError: except ValueError:
print("Couldn't parse page for " + full_path) print("Couldn't parse page for " + full_path)
return info return info

View File

@ -15,4 +15,5 @@ ebooklib
html2text html2text
docx2txt docx2txt
xlrd xlrd
six six
cairosvg

10
run.py
View File

@ -67,7 +67,7 @@ def download(doc_id):
extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"] extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"]
full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension) full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension)
return send_file(full_path, mimetype=doc["mime"]) return send_file(full_path, mimetype=doc["mime"], conditional=True)
@app.route("/thumb/<doc_id>") @app.route("/thumb/<doc_id>")
@ -195,9 +195,13 @@ def directory_update(dir_id):
# Only name and enabled status can be updated # Only name and enabled status can be updated
updated_dir = Directory(path, enabled, directory.options, name) updated_dir = Directory(path, enabled, directory.options, name)
updated_dir.id = dir_id updated_dir.id = dir_id
storage.update_directory(updated_dir)
flash("<strong>Updated directory</strong>", "success") try:
storage.update_directory(updated_dir)
flash("<strong>Updated directory</strong>", "success")
except DuplicateDirectoryException:
flash("<strong>Couldn't update directory</strong> Make sure that the path is unique", "danger")
return redirect("/directory/" + str(dir_id)) return redirect("/directory/" + str(dir_id))

View File

@ -278,14 +278,17 @@ class LocalStorage:
self.dir_cache_outdated = True self.dir_cache_outdated = True
conn = sqlite3.connect(self.db_path) try:
c = conn.cursor() conn = sqlite3.connect(self.db_path)
c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?", c = conn.cursor()
(directory.name, directory.path, directory.enabled, directory.id)) c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
(directory.name, directory.path, directory.enabled, directory.id))
c.close() c.close()
conn.commit() conn.commit()
conn.close() conn.close()
except sqlite3.IntegrityError:
raise DuplicateDirectoryException("Duplicate directory: " + directory.path)
def save_option(self, option: Option): def save_option(self, option: Option):

View File

@ -7,7 +7,7 @@
{% block body %} {% block body %}
<style> <style>
body {overflow-y:scroll;} body {overflow-y:scroll;}
.document { .document {
padding: 0.5rem; padding: 0.5rem;
} }
@ -79,13 +79,17 @@
.fit { .fit {
width: 100%; width: 100%;
height: 100%; height: 100%;
{# margin-top: 3px;#}
padding: 3px; padding: 3px;
min-width: 64px; min-width: 64px;
max-width: 100%; max-width: 100%;
max-height: 256px; max-height: 256px;
} }
.audio-fit {
height: 39px;
vertical-align: bottom;
}
@media (min-width: 1200px) { @media (min-width: 1200px) {
.card-columns { .card-columns {
column-count: 4; column-count: 4;
@ -156,7 +160,6 @@
<div class="container"> <div class="container">
<div class="card"> <div class="card">
{# <div class="card-header">An excellent form</div>#}
<div class="card-body"> <div class="card-body">
<div class="form-group"> <div class="form-group">
<input id="pathBar" type="search" class="form-control" placeholder="Path"> <input id="pathBar" type="search" class="form-control" placeholder="Path">
@ -190,8 +193,6 @@
<div class="col"> <div class="col">
<label>Mime types</label> <label>Mime types</label>
<button class="btn btn-xs btn-success" onclick="toggleTree()" style="float: right">Toggle</button>
<div class="tree"></div> <div class="tree"></div>
</div> </div>
</div> </div>
@ -209,9 +210,20 @@
//Select all //Select all
tree.select(); tree.select();
tree.node("any").deselect();
tree.on("node.click", function(event, node, handler) { tree.on("node.click", function(event, node, handler) {
event.preventTreeDefault(); event.preventTreeDefault();
if (node.id === "any") {
if (!node.itree.state.checked) {
tree.deselect();
}
} else {
tree.node("any").deselect();
}
handler(); handler();
searchQueued = true; searchQueued = true;
}) })
@ -530,26 +542,27 @@
} }
break; break;
case "image": case "image": {
let formatTag = document.createElement("span");
formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-image"); formatTag.setAttribute("class", "badge badge-pill badge-image");
formatTag.appendChild(document.createTextNode(format)); formatTag.appendChild(document.createTextNode(format));
tags.push(formatTag); tags.push(formatTag);
}
break; break;
case "audio": case "audio": {
formatTag = document.createElement("span"); let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-audio"); formatTag.setAttribute("class", "badge badge-pill badge-audio");
formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"])); formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
tags.push(formatTag); tags.push(formatTag);
}
break; break;
case "text": case "text": {
formatTag = document.createElement("span"); let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-text"); formatTag.setAttribute("class", "badge badge-pill badge-text");
formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"])); formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
tags.push(formatTag); tags.push(formatTag);
}
break; break;
} }
@ -563,17 +576,17 @@
docCard.appendChild(contentDiv); docCard.appendChild(contentDiv);
} }
//Font_name
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
let contentDiv = document.createElement("div");
contentDiv.setAttribute("class", "content-div bg-light");
contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
docCard.appendChild(contentDiv);
}
//Audio //Audio
if (mimeCategory === "audio") { if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("format_long_name")) {
//TODO
let audio = document.createElement("audio");
audio.setAttribute("preload", "none");
audio.setAttribute("class", "audio-fit fit");
audio.setAttribute("controls", "");
audio.setAttribute("type", hit["_source"]["mime"]);
audio.setAttribute("src", "file/" + hit["_id"]);
docCard.appendChild(audio)
} }
if (thumbnail !== null) { if (thumbnail !== null) {
@ -671,6 +684,11 @@
let selected = tree.selected(); let selected = tree.selected();
for (let i = 0; i < selected.length; i++) { for (let i = 0; i < selected.length; i++) {
if(selected[i].id === "any") {
return "any"
}
//Only get children //Only get children
if (selected[i].text.indexOf("(") !== -1) { if (selected[i].text.indexOf("(") !== -1) {
mimeTypes.push(selected[i].id); mimeTypes.push(selected[i].id);
@ -734,7 +752,7 @@
postBody.mime_types = getSelectedMimeTypes(); postBody.mime_types = getSelectedMimeTypes();
postBody.must_match = must_match; postBody.must_match = must_match;
postBody.directories = selectedDirs; postBody.directories = selectedDirs;
postBody.path = pathBar.value.replace(/\/$/, ""); //remove trailing slashes postBody.path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
xhttp.setRequestHeader('content-type', 'application/json'); xhttp.setRequestHeader('content-type', 'application/json');
xhttp.send(JSON.stringify(postBody)); xhttp.send(JSON.stringify(postBody));
} }

View File

@ -1,8 +1,8 @@
from PIL import Image from PIL import Image
import os import os
from parsing import ContentMimeGuesser, ExtensionMimeGuesser
from multiprocessing import Value from multiprocessing import Value
import ffmpeg import ffmpeg
import cairosvg
class ThumbnailGenerator: class ThumbnailGenerator:
@ -17,7 +17,16 @@ class ThumbnailGenerator:
if mime is None: if mime is None:
return return
if mime.startswith("image"): if mime == "image/svg+xml":
try:
cairosvg.svg2png(url=path, write_to="tmp")
self.generate_image("tmp", dest_path)
os.remove("tmp")
except Exception:
print("Couldn't make thumbnail for " + path)
elif mime.startswith("image"):
try: try:
self.generate_image(path, dest_path) self.generate_image(path, dest_path)