Audio tags in search page and svg thumbnail generation

This commit is contained in:
simon 2018-04-17 11:45:31 -04:00
parent dff7ddc511
commit 4eb9cf6b63
8 changed files with 128 additions and 105 deletions

View File

@ -2,8 +2,10 @@ default_options = {
"ThumbnailQuality": "85",
"ThumbnailSize": "275",
"ThumbnailColor": "FF00FF",
"TextFileContentLength": "8192",
"PdfFileContentLength": "8192",
"TextFileContentLength": "2000",
"PdfFileContentLength": "2000",
"SpreadsheetContentLength": "2000",
"EbookContentLength": "2000",
"MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font" # media, text, picture

View File

@ -5,7 +5,7 @@ from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser
PdfFileParser, DocxParser, EbookParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
@ -77,7 +77,7 @@ class Crawler:
except FileNotFoundError:
continue # File was deleted
if self.indexer is not None:
if self.indexer is not None and len(self.documents) > 0:
self.indexer.index(self.documents, self.dir_id)
def countFiles(self, root_dir: str):
@ -141,7 +141,8 @@ class TaskManager:
PictureFileParser(chksum_calcs),
FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter)

View File

@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
"text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
"text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
"text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
"text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
]
self.encodings = [
'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
"text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
"text/x-makefile"
]
def parse(self, full_path: str):
    """Read the head of a text file and store its detected encoding and content.

    Starts from the dict returned by the parent parser. When
    ``self.content_length`` is not positive the file is not opened and the
    dict is returned unchanged. Otherwise at most ``self.content_length``
    bytes are read, chardet guesses the encoding, and on success
    ``info["encoding"]`` and the HTML-escaped ``info["content"]`` are set.

    :param full_path: path of the file to read
    :return: the info dict from the parent parser, possibly extended with
        "encoding" and "content"
    """
    info = super().parse(full_path)

    # A non-positive limit disables content extraction entirely
    if self.content_length <= 0:
        return info

    # Only the read needs the file handle; release it before detection/decoding
    with open(full_path, "rb") as text_file:
        raw_content = text_file.read(self.content_length)

    # Detect once (the result of a second, discarded call was never used)
    encoding = chardet.detect(raw_content)["encoding"]
    if encoding is None:
        return info

    # The detected name is recorded even if Python cannot decode with it
    info["encoding"] = encoding
    try:
        # "ignore" drops undecodable bytes instead of raising on them
        info["content"] = html.escape(raw_content.decode(encoding, "ignore"))
    except (LookupError, UnicodeError):
        # LookupError: chardet reported a codec name Python does not know
        print("Unknown encoding: " + encoding)

    return info
@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
def parse(self, full_path: str):
    """Extract the title and leading text of a PDF into ``info["content"]``.

    Starts from the dict returned by the parent parser. When
    ``self.content_length`` is not positive the file is not opened and no
    "content" key is added. Otherwise "content" holds the document title
    (when the first info dict carries a non-empty "Title") followed by the
    text of text boxes / text lines, page by page, capped at
    ``self.content_length`` characters in total.

    :param full_path: path of the PDF file to read
    :return: the info dict from the parent parser, possibly with "content"
    """
    info = super().parse(full_path)

    # A non-positive limit disables content extraction entirely
    if self.content_length <= 0:
        return info

    with open(full_path, "rb") as f:

        info["content"] = ""

        parser = PDFParser(f)
        document = PDFDocument(parser)

        if len(document.info) > 0 and "Title" in document.info[0]:
            title = document.info[0]["Title"]
            if title != b"":
                if not isinstance(title, bytes):
                    # Not raw bytes: resolve it first, then decode the result
                    title = title.resolve()
                # Clamp the title too: an over-long title used to push the
                # remaining budget negative, so later slices cut from the
                # end of the text and the limit was exceeded
                title_text = title.decode("utf-8", "replace") + "\n"
                info["content"] += title_text[:self.content_length]

        try:
            if document.is_extractable:
                resource_manager = PDFResourceManager()
                la_params = LAParams()

                device = PDFPageAggregator(resource_manager, laparams=la_params)
                interpreter = PDFPageInterpreter(resource_manager, device)

                for page in PDFPage.create_pages(document):
                    # Stop before interpreting pages that cannot contribute
                    if len(info["content"]) >= self.content_length:
                        break

                    interpreter.process_page(page)

                    for lt_obj in device.get_result():
                        if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            continue

                        remaining = self.content_length - len(info["content"])
                        if remaining <= 0:
                            break

                        # Appends the whole text when it fits, else only
                        # the part that fills the remaining budget
                        info["content"] += lt_obj.get_text()[:remaining]
            else:
                print("PDF is not extractable: " + full_path)
        except ValueError:
            print("Couldn't parse page for " + full_path)

    return info

View File

@ -15,4 +15,5 @@ ebooklib
html2text
docx2txt
xlrd
six
six
cairosvg

10
run.py
View File

@ -67,7 +67,7 @@ def download(doc_id):
extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"]
full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension)
return send_file(full_path, mimetype=doc["mime"])
return send_file(full_path, mimetype=doc["mime"], conditional=True)
@app.route("/thumb/<doc_id>")
@ -195,9 +195,13 @@ def directory_update(dir_id):
# Only name and enabled status can be updated
updated_dir = Directory(path, enabled, directory.options, name)
updated_dir.id = dir_id
storage.update_directory(updated_dir)
flash("<strong>Updated directory</strong>", "success")
try:
storage.update_directory(updated_dir)
flash("<strong>Updated directory</strong>", "success")
except DuplicateDirectoryException:
flash("<strong>Couldn't update directory</strong> Make sure that the path is unique", "danger")
return redirect("/directory/" + str(dir_id))

View File

@ -278,14 +278,17 @@ class LocalStorage:
self.dir_cache_outdated = True
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
(directory.name, directory.path, directory.enabled, directory.id))
try:
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
(directory.name, directory.path, directory.enabled, directory.id))
c.close()
conn.commit()
conn.close()
c.close()
conn.commit()
conn.close()
except sqlite3.IntegrityError:
raise DuplicateDirectoryException("Duplicate directory: " + directory.path)
def save_option(self, option: Option):

View File

@ -7,7 +7,7 @@
{% block body %}
<style>
body {overflow-y:scroll;}
body {overflow-y:scroll;}
.document {
padding: 0.5rem;
}
@ -79,13 +79,17 @@
.fit {
width: 100%;
height: 100%;
{# margin-top: 3px;#}
padding: 3px;
min-width: 64px;
max-width: 100%;
max-height: 256px;
}
.audio-fit {
height: 39px;
vertical-align: bottom;
}
@media (min-width: 1200px) {
.card-columns {
column-count: 4;
@ -156,7 +160,6 @@
<div class="container">
<div class="card">
{# <div class="card-header">An excellent form</div>#}
<div class="card-body">
<div class="form-group">
<input id="pathBar" type="search" class="form-control" placeholder="Path">
@ -190,8 +193,6 @@
<div class="col">
<label>Mime types</label>
<button class="btn btn-xs btn-success" onclick="toggleTree()" style="float: right">Toggle</button>
<div class="tree"></div>
</div>
</div>
@ -209,9 +210,20 @@
//Select all
tree.select();
tree.node("any").deselect();
tree.on("node.click", function(event, node, handler) {
event.preventTreeDefault();
if (node.id === "any") {
if (!node.itree.state.checked) {
tree.deselect();
}
} else {
tree.node("any").deselect();
}
handler();
searchQueued = true;
})
@ -530,26 +542,27 @@
}
break;
case "image":
formatTag = document.createElement("span");
case "image": {
let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-image");
formatTag.appendChild(document.createTextNode(format));
tags.push(formatTag);
}
break;
case "audio":
formatTag = document.createElement("span");
case "audio": {
let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-audio");
formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
tags.push(formatTag);
}
break;
case "text":
formatTag = document.createElement("span");
case "text": {
let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-text");
formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
tags.push(formatTag);
}
break;
}
@ -563,17 +576,17 @@
docCard.appendChild(contentDiv);
}
//Font_name
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
let contentDiv = document.createElement("div");
contentDiv.setAttribute("class", "content-div bg-light");
contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
docCard.appendChild(contentDiv);
}
//Audio
if (mimeCategory === "audio") {
//TODO
if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("format_long_name")) {
let audio = document.createElement("audio");
audio.setAttribute("preload", "none");
audio.setAttribute("class", "audio-fit fit");
audio.setAttribute("controls", "");
audio.setAttribute("type", hit["_source"]["mime"]);
audio.setAttribute("src", "file/" + hit["_id"]);
docCard.appendChild(audio)
}
if (thumbnail !== null) {
@ -671,6 +684,11 @@
let selected = tree.selected();
for (let i = 0; i < selected.length; i++) {
if(selected[i].id === "any") {
return "any"
}
//Only get children
if (selected[i].text.indexOf("(") !== -1) {
mimeTypes.push(selected[i].id);
@ -734,7 +752,7 @@
postBody.mime_types = getSelectedMimeTypes();
postBody.must_match = must_match;
postBody.directories = selectedDirs;
postBody.path = pathBar.value.replace(/\/$/, ""); //remove trailing slashes
postBody.path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
xhttp.setRequestHeader('content-type', 'application/json');
xhttp.send(JSON.stringify(postBody));
}

View File

@ -1,8 +1,8 @@
from PIL import Image
import os
from parsing import ContentMimeGuesser, ExtensionMimeGuesser
from multiprocessing import Value
import ffmpeg
import cairosvg
class ThumbnailGenerator:
@ -17,7 +17,16 @@ class ThumbnailGenerator:
if mime is None:
return
if mime.startswith("image"):
if mime == "image/svg+xml":
try:
cairosvg.svg2png(url=path, write_to="tmp")
self.generate_image("tmp", dest_path)
os.remove("tmp")
except Exception:
print("Couldn't make thumbnail for " + path)
elif mime.startswith("image"):
try:
self.generate_image(path, dest_path)