mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
Audio tags in search page and svg thumbnail generation
This commit is contained in:
parent
dff7ddc511
commit
4eb9cf6b63
@ -2,8 +2,10 @@ default_options = {
|
||||
"ThumbnailQuality": "85",
|
||||
"ThumbnailSize": "275",
|
||||
"ThumbnailColor": "FF00FF",
|
||||
"TextFileContentLength": "8192",
|
||||
"PdfFileContentLength": "8192",
|
||||
"TextFileContentLength": "2000",
|
||||
"PdfFileContentLength": "2000",
|
||||
"SpreadsheetContentLength": "2000",
|
||||
"EbookContentLength": "2000",
|
||||
"MimeGuesser": "extension", # extension, content
|
||||
"CheckSumCalculators": "", # md5, sha1, sha256
|
||||
"FileParsers": "media, text, picture, font" # media, text, picture
|
||||
|
@ -5,7 +5,7 @@ from multiprocessing import Process, Value
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||
PdfFileParser, DocxParser
|
||||
PdfFileParser, DocxParser, EbookParser
|
||||
from indexer import Indexer
|
||||
from search import Search
|
||||
from thumbnail import ThumbnailGenerator
|
||||
@ -77,7 +77,7 @@ class Crawler:
|
||||
except FileNotFoundError:
|
||||
continue # File was deleted
|
||||
|
||||
if self.indexer is not None:
|
||||
if self.indexer is not None and len(self.documents) > 0:
|
||||
self.indexer.index(self.documents, self.dir_id)
|
||||
|
||||
def countFiles(self, root_dir: str):
|
||||
@ -141,7 +141,8 @@ class TaskManager:
|
||||
PictureFileParser(chksum_calcs),
|
||||
FontParser(chksum_calcs),
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
|
||||
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
||||
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
|
||||
EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
||||
mime_guesser, self.indexer, directory.id)
|
||||
c.crawl(directory.path, counter)
|
||||
|
||||
|
111
parsing.py
111
parsing.py
@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
|
||||
"text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
|
||||
"text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
|
||||
"text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
|
||||
"text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
|
||||
]
|
||||
|
||||
self.encodings = [
|
||||
'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
|
||||
'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
|
||||
'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
|
||||
'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
|
||||
'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
|
||||
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
|
||||
'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
|
||||
'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
|
||||
'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
|
||||
'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
|
||||
'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
|
||||
'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
|
||||
'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
|
||||
'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
|
||||
'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
|
||||
'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
|
||||
'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
|
||||
"text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
|
||||
"text/x-makefile"
|
||||
]
|
||||
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
with open(full_path, "rb") as text_file:
|
||||
raw_content = text_file.read(self.content_length)
|
||||
if self.content_length > 0:
|
||||
with open(full_path, "rb") as text_file:
|
||||
raw_content = text_file.read(self.content_length)
|
||||
|
||||
chardet.detect(raw_content)
|
||||
encoding = chardet.detect(raw_content)["encoding"]
|
||||
chardet.detect(raw_content)
|
||||
encoding = chardet.detect(raw_content)["encoding"]
|
||||
|
||||
if encoding is not None and encoding in self.encodings:
|
||||
info["encoding"] = encoding
|
||||
content = raw_content.decode(encoding, "ignore")
|
||||
|
||||
info["content"] = html.escape(content)
|
||||
if encoding is not None:
|
||||
info["encoding"] = encoding
|
||||
try:
|
||||
content = raw_content.decode(encoding, "ignore")
|
||||
info["content"] = html.escape(content)
|
||||
except Exception:
|
||||
print("Unknown encoding: " + encoding)
|
||||
|
||||
return info
|
||||
|
||||
@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
|
||||
def parse(self, full_path: str):
|
||||
info = super().parse(full_path)
|
||||
|
||||
with open(full_path, "rb") as f:
|
||||
if self.content_length > 0:
|
||||
with open(full_path, "rb") as f:
|
||||
|
||||
info["content"] = ""
|
||||
info["content"] = ""
|
||||
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
parser = PDFParser(f)
|
||||
document = PDFDocument(parser)
|
||||
|
||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
||||
if isinstance(document.info[0]["Title"], bytes):
|
||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||
else:
|
||||
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
||||
if isinstance(document.info[0]["Title"], bytes):
|
||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||
else:
|
||||
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
||||
|
||||
try:
|
||||
if document.is_extractable:
|
||||
resource_manager = PDFResourceManager()
|
||||
la_params = LAParams()
|
||||
try:
|
||||
if document.is_extractable:
|
||||
resource_manager = PDFResourceManager()
|
||||
la_params = LAParams()
|
||||
|
||||
device = PDFPageAggregator(resource_manager, laparams=la_params)
|
||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||
device = PDFPageAggregator(resource_manager, laparams=la_params)
|
||||
interpreter = PDFPageInterpreter(resource_manager, device)
|
||||
|
||||
for page in PDFPage.create_pages(document):
|
||||
for page in PDFPage.create_pages(document):
|
||||
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
interpreter.process_page(page)
|
||||
layout = device.get_result()
|
||||
|
||||
for lt_obj in layout:
|
||||
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
|
||||
for lt_obj in layout:
|
||||
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
|
||||
|
||||
text = lt_obj.get_text()
|
||||
text = lt_obj.get_text()
|
||||
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
else:
|
||||
continue
|
||||
break
|
||||
else:
|
||||
print("PDF is not extractable: " + full_path)
|
||||
except ValueError:
|
||||
print("Couldn't parse page for " + full_path)
|
||||
if len(info["content"]) + len(text) <= self.content_length:
|
||||
info["content"] += text
|
||||
else:
|
||||
info["content"] += text[0:self.content_length - len(info["content"])]
|
||||
break
|
||||
else:
|
||||
continue
|
||||
break
|
||||
else:
|
||||
print("PDF is not extractable: " + full_path)
|
||||
except ValueError:
|
||||
print("Couldn't parse page for " + full_path)
|
||||
|
||||
return info
|
||||
|
||||
|
@ -15,4 +15,5 @@ ebooklib
|
||||
html2text
|
||||
docx2txt
|
||||
xlrd
|
||||
six
|
||||
six
|
||||
cairosvg
|
10
run.py
10
run.py
@ -67,7 +67,7 @@ def download(doc_id):
|
||||
extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"]
|
||||
full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension)
|
||||
|
||||
return send_file(full_path, mimetype=doc["mime"])
|
||||
return send_file(full_path, mimetype=doc["mime"], conditional=True)
|
||||
|
||||
|
||||
@app.route("/thumb/<doc_id>")
|
||||
@ -195,9 +195,13 @@ def directory_update(dir_id):
|
||||
# Only name and enabled status can be updated
|
||||
updated_dir = Directory(path, enabled, directory.options, name)
|
||||
updated_dir.id = dir_id
|
||||
storage.update_directory(updated_dir)
|
||||
|
||||
flash("<strong>Updated directory</strong>", "success")
|
||||
try:
|
||||
storage.update_directory(updated_dir)
|
||||
flash("<strong>Updated directory</strong>", "success")
|
||||
|
||||
except DuplicateDirectoryException:
|
||||
flash("<strong>Couldn't update directory</strong> Make sure that the path is unique", "danger")
|
||||
|
||||
return redirect("/directory/" + str(dir_id))
|
||||
|
||||
|
17
storage.py
17
storage.py
@ -278,14 +278,17 @@ class LocalStorage:
|
||||
|
||||
self.dir_cache_outdated = True
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
c = conn.cursor()
|
||||
c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
|
||||
(directory.name, directory.path, directory.enabled, directory.id))
|
||||
try:
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
c = conn.cursor()
|
||||
c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
|
||||
(directory.name, directory.path, directory.enabled, directory.id))
|
||||
|
||||
c.close()
|
||||
conn.commit()
|
||||
conn.close()
|
||||
c.close()
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except sqlite3.IntegrityError:
|
||||
raise DuplicateDirectoryException("Duplicate directory: " + directory.path)
|
||||
|
||||
def save_option(self, option: Option):
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
{% block body %}
|
||||
|
||||
<style>
|
||||
body {overflow-y:scroll;}
|
||||
body {overflow-y:scroll;}
|
||||
.document {
|
||||
padding: 0.5rem;
|
||||
}
|
||||
@ -79,13 +79,17 @@
|
||||
.fit {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
{# margin-top: 3px;#}
|
||||
padding: 3px;
|
||||
min-width: 64px;
|
||||
max-width: 100%;
|
||||
max-height: 256px;
|
||||
}
|
||||
|
||||
.audio-fit {
|
||||
height: 39px;
|
||||
vertical-align: bottom;
|
||||
}
|
||||
|
||||
@media (min-width: 1200px) {
|
||||
.card-columns {
|
||||
column-count: 4;
|
||||
@ -156,7 +160,6 @@
|
||||
<div class="container">
|
||||
|
||||
<div class="card">
|
||||
{# <div class="card-header">An excellent form</div>#}
|
||||
<div class="card-body">
|
||||
<div class="form-group">
|
||||
<input id="pathBar" type="search" class="form-control" placeholder="Path">
|
||||
@ -190,8 +193,6 @@
|
||||
<div class="col">
|
||||
<label>Mime types</label>
|
||||
|
||||
<button class="btn btn-xs btn-success" onclick="toggleTree()" style="float: right">Toggle</button>
|
||||
|
||||
<div class="tree"></div>
|
||||
</div>
|
||||
</div>
|
||||
@ -209,9 +210,20 @@
|
||||
|
||||
//Select all
|
||||
tree.select();
|
||||
tree.node("any").deselect();
|
||||
|
||||
tree.on("node.click", function(event, node, handler) {
|
||||
event.preventTreeDefault();
|
||||
|
||||
if (node.id === "any") {
|
||||
|
||||
if (!node.itree.state.checked) {
|
||||
tree.deselect();
|
||||
}
|
||||
} else {
|
||||
tree.node("any").deselect();
|
||||
}
|
||||
|
||||
handler();
|
||||
searchQueued = true;
|
||||
})
|
||||
@ -530,26 +542,27 @@
|
||||
}
|
||||
|
||||
break;
|
||||
case "image":
|
||||
|
||||
formatTag = document.createElement("span");
|
||||
case "image": {
|
||||
let formatTag = document.createElement("span");
|
||||
formatTag.setAttribute("class", "badge badge-pill badge-image");
|
||||
formatTag.appendChild(document.createTextNode(format));
|
||||
tags.push(formatTag);
|
||||
|
||||
}
|
||||
break;
|
||||
case "audio":
|
||||
formatTag = document.createElement("span");
|
||||
case "audio": {
|
||||
let formatTag = document.createElement("span");
|
||||
formatTag.setAttribute("class", "badge badge-pill badge-audio");
|
||||
formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
|
||||
tags.push(formatTag);
|
||||
}
|
||||
|
||||
break;
|
||||
case "text":
|
||||
formatTag = document.createElement("span");
|
||||
case "text": {
|
||||
let formatTag = document.createElement("span");
|
||||
formatTag.setAttribute("class", "badge badge-pill badge-text");
|
||||
formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
|
||||
tags.push(formatTag);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
@ -563,17 +576,17 @@
|
||||
docCard.appendChild(contentDiv);
|
||||
}
|
||||
|
||||
//Font_name
|
||||
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
|
||||
let contentDiv = document.createElement("div");
|
||||
contentDiv.setAttribute("class", "content-div bg-light");
|
||||
contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
|
||||
docCard.appendChild(contentDiv);
|
||||
}
|
||||
|
||||
//Audio
|
||||
if (mimeCategory === "audio") {
|
||||
//TODO
|
||||
if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("format_long_name")) {
|
||||
|
||||
let audio = document.createElement("audio");
|
||||
audio.setAttribute("preload", "none");
|
||||
audio.setAttribute("class", "audio-fit fit");
|
||||
audio.setAttribute("controls", "");
|
||||
audio.setAttribute("type", hit["_source"]["mime"]);
|
||||
audio.setAttribute("src", "file/" + hit["_id"]);
|
||||
|
||||
docCard.appendChild(audio)
|
||||
}
|
||||
|
||||
if (thumbnail !== null) {
|
||||
@ -671,6 +684,11 @@
|
||||
let selected = tree.selected();
|
||||
|
||||
for (let i = 0; i < selected.length; i++) {
|
||||
|
||||
if(selected[i].id === "any") {
|
||||
return "any"
|
||||
}
|
||||
|
||||
//Only get children
|
||||
if (selected[i].text.indexOf("(") !== -1) {
|
||||
mimeTypes.push(selected[i].id);
|
||||
@ -734,7 +752,7 @@
|
||||
postBody.mime_types = getSelectedMimeTypes();
|
||||
postBody.must_match = must_match;
|
||||
postBody.directories = selectedDirs;
|
||||
postBody.path = pathBar.value.replace(/\/$/, ""); //remove trailing slashes
|
||||
postBody.path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
|
||||
xhttp.setRequestHeader('content-type', 'application/json');
|
||||
xhttp.send(JSON.stringify(postBody));
|
||||
}
|
||||
|
13
thumbnail.py
13
thumbnail.py
@ -1,8 +1,8 @@
|
||||
from PIL import Image
|
||||
import os
|
||||
from parsing import ContentMimeGuesser, ExtensionMimeGuesser
|
||||
from multiprocessing import Value
|
||||
import ffmpeg
|
||||
import cairosvg
|
||||
|
||||
|
||||
class ThumbnailGenerator:
|
||||
@ -17,7 +17,16 @@ class ThumbnailGenerator:
|
||||
if mime is None:
|
||||
return
|
||||
|
||||
if mime.startswith("image"):
|
||||
if mime == "image/svg+xml":
|
||||
|
||||
try:
|
||||
cairosvg.svg2png(url=path, write_to="tmp")
|
||||
self.generate_image("tmp", dest_path)
|
||||
os.remove("tmp")
|
||||
except Exception:
|
||||
print("Couldn't make thumbnail for " + path)
|
||||
|
||||
elif mime.startswith("image"):
|
||||
|
||||
try:
|
||||
self.generate_image(path, dest_path)
|
||||
|
Loading…
x
Reference in New Issue
Block a user