Audio tags in search page and svg thumbnail generation

2025-10-27 05:56:52 +00:00 · 2018-04-17 11:45:31 -04:00 · 2018-04-17 11:45:31 -04:00 · 4eb9cf6b63
commit 4eb9cf6b63
parent dff7ddc511
8 changed files with 128 additions and 105 deletions
--- a/config.py
+++ b/config.py
@ -2,8 +2,10 @@ default_options = {
    "ThumbnailQuality": "85",
    "ThumbnailSize": "275",
    "ThumbnailColor": "FF00FF",
-    "TextFileContentLength": "8192",
-    "PdfFileContentLength": "8192",
+    "TextFileContentLength": "2000",
+    "PdfFileContentLength": "2000",
+    "SpreadsheetContentLength": "2000",
+    "EbookContentLength": "2000",
    "MimeGuesser": "extension",  # extension, content
    "CheckSumCalculators": "",  # md5, sha1, sha256
    "FileParsers": "media, text, picture, font"  # media, text, picture
--- a/crawler.py
+++ b/crawler.py
@ -5,7 +5,7 @@ from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
-    PdfFileParser, DocxParser
+    PdfFileParser, DocxParser, EbookParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@ -77,7 +77,7 @@ class Crawler:
                except FileNotFoundError:
                    continue  # File was deleted

-        if self.indexer is not None:
+        if self.indexer is not None and len(self.documents) > 0:
            self.indexer.index(self.documents, self.dir_id)

    def countFiles(self, root_dir: str):
@ -141,7 +141,8 @@ class TaskManager:
                     PictureFileParser(chksum_calcs),
                     FontParser(chksum_calcs),
                     PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
-                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
+                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
+                     EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
                    mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)

--- a/parsing.py
+++ b/parsing.py
@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
            "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
            "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
            "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
-            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
-        ]
-
-        self.encodings = [
-            'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
-            'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
-            'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
-            'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
-            'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
-            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
-            'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
-            'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
-            'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
-            'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
-            'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
-            'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
-            'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
-            'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
-            'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
-            'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
-            'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
+            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
+            "text/x-makefile"
        ]

    def parse(self, full_path: str):
        info = super().parse(full_path)

-        with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_length)
+        if self.content_length > 0:
+            with open(full_path, "rb") as text_file:
+                raw_content = text_file.read(self.content_length)

-            chardet.detect(raw_content)
-            encoding = chardet.detect(raw_content)["encoding"]
+                chardet.detect(raw_content)
+                encoding = chardet.detect(raw_content)["encoding"]

-            if encoding is not None and encoding in self.encodings:
-                info["encoding"] = encoding
-                content = raw_content.decode(encoding, "ignore")
-
-                info["content"] = html.escape(content)
+                if encoding is not None:
+                    info["encoding"] = encoding
+                    try:
+                        content = raw_content.decode(encoding, "ignore")
+                        info["content"] = html.escape(content)
+                    except Exception:
+                        print("Unknown encoding: " + encoding)

        return info

@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
    def parse(self, full_path: str):
        info = super().parse(full_path)

-        with open(full_path, "rb") as f:
+        if self.content_length > 0:
+            with open(full_path, "rb") as f:

-            info["content"] = ""
+                info["content"] = ""

-            parser = PDFParser(f)
-            document = PDFDocument(parser)
+                parser = PDFParser(f)
+                document = PDFDocument(parser)

-            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                if isinstance(document.info[0]["Title"], bytes):
-                    info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                else:
-                    info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
+                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                    if isinstance(document.info[0]["Title"], bytes):
+                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+                    else:
+                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"

-            try:
-                if document.is_extractable:
-                    resource_manager = PDFResourceManager()
-                    la_params = LAParams()
+                try:
+                    if document.is_extractable:
+                        resource_manager = PDFResourceManager()
+                        la_params = LAParams()

-                    device = PDFPageAggregator(resource_manager, laparams=la_params)
-                    interpreter = PDFPageInterpreter(resource_manager, device)
+                        device = PDFPageAggregator(resource_manager, laparams=la_params)
+                        interpreter = PDFPageInterpreter(resource_manager, device)

-                    for page in PDFPage.create_pages(document):
+                        for page in PDFPage.create_pages(document):

-                        interpreter.process_page(page)
-                        layout = device.get_result()
+                            interpreter.process_page(page)
+                            layout = device.get_result()

-                        for lt_obj in layout:
-                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+                            for lt_obj in layout:
+                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):

-                                text = lt_obj.get_text()
+                                    text = lt_obj.get_text()

-                                if len(info["content"]) + len(text) <= self.content_length:
-                                    info["content"] += text
-                                else:
-                                    info["content"] += text[0:self.content_length - len(info["content"])]
-                                    break
-                        else:
-                            continue
-                        break
-                else:
-                    print("PDF is not extractable: " + full_path)
-            except ValueError:
-                print("Couldn't parse page for " + full_path)
+                                    if len(info["content"]) + len(text) <= self.content_length:
+                                        info["content"] += text
+                                    else:
+                                        info["content"] += text[0:self.content_length - len(info["content"])]
+                                        break
+                            else:
+                                continue
+                            break
+                    else:
+                        print("PDF is not extractable: " + full_path)
+                except ValueError:
+                    print("Couldn't parse page for " + full_path)

        return info

--- a/requirements.txt
+++ b/requirements.txt
@ -15,4 +15,5 @@ ebooklib
 html2text
 docx2txt
 xlrd
-six
+six
+cairosvg
--- a/run.py
+++ b/run.py
@ -67,7 +67,7 @@ def download(doc_id):
    extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"]
    full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension)

-    return send_file(full_path, mimetype=doc["mime"])
+    return send_file(full_path, mimetype=doc["mime"], conditional=True)


@app.route("/thumb/<doc_id>")
@ -195,9 +195,13 @@ def directory_update(dir_id):
    # Only name and enabled status can be updated
    updated_dir = Directory(path, enabled, directory.options, name)
    updated_dir.id = dir_id
-    storage.update_directory(updated_dir)

-    flash("<strong>Updated directory</strong>", "success")
+    try:
+        storage.update_directory(updated_dir)
+        flash("<strong>Updated directory</strong>", "success")
+
+    except DuplicateDirectoryException:
+        flash("<strong>Couldn't update directory</strong> Make sure that the path is unique", "danger")

    return redirect("/directory/" + str(dir_id))

--- a/storage.py
+++ b/storage.py
@ -278,14 +278,17 @@ class LocalStorage:

        self.dir_cache_outdated = True

-        conn = sqlite3.connect(self.db_path)
-        c = conn.cursor()
-        c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
-                  (directory.name, directory.path, directory.enabled, directory.id))
+        try:
+            conn = sqlite3.connect(self.db_path)
+            c = conn.cursor()
+            c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
+                      (directory.name, directory.path, directory.enabled, directory.id))

-        c.close()
-        conn.commit()
-        conn.close()
+            c.close()
+            conn.commit()
+            conn.close()
+        except sqlite3.IntegrityError:
+            raise DuplicateDirectoryException("Duplicate directory: " + directory.path)

    def save_option(self, option: Option):

--- a/templates/search.html
+++ b/templates/search.html
@ -7,7 +7,7 @@
 {% block body %}

    <style>
-        body    {overflow-y:scroll;}
+        body {overflow-y:scroll;}
        .document {
            padding: 0.5rem;
        }
@ -79,13 +79,17 @@
        .fit {
            width: 100%;
            height: 100%;
-        {#            margin-top: 3px;#}
            padding: 3px;
            min-width: 64px;
            max-width: 100%;
            max-height: 256px;
        }

+        .audio-fit {
+            height: 39px;
+            vertical-align: bottom;
+        }
+
        @media (min-width: 1200px) {
            .card-columns {
                column-count: 4;
@ -156,7 +160,6 @@
    <div class="container">

        <div class="card">
-            {#            <div class="card-header">An excellent form</div>#}
            <div class="card-body">
                <div class="form-group">
                    <input id="pathBar" type="search" class="form-control" placeholder="Path">
@ -190,8 +193,6 @@
                    <div class="col">
                        <label>Mime types</label>

-                        <button class="btn btn-xs btn-success" onclick="toggleTree()" style="float: right">Toggle</button>
-
                        <div class="tree"></div>
                    </div>
                </div>
@ -209,9 +210,20 @@

                    //Select all
                    tree.select();
+                    tree.node("any").deselect();

                    tree.on("node.click", function(event, node, handler) {
                        event.preventTreeDefault();
+
+                        if (node.id === "any") {
+
+                            if (!node.itree.state.checked) {
+                                tree.deselect();
+                            }
+                        } else {
+                            tree.node("any").deselect();
+                        }
+
                        handler();
                        searchQueued = true;
                    })
@ -530,26 +542,27 @@
                            }

                            break;
-                        case "image":
-
-                            formatTag = document.createElement("span");
+                        case "image": {
+                            let formatTag = document.createElement("span");
                            formatTag.setAttribute("class", "badge badge-pill badge-image");
                            formatTag.appendChild(document.createTextNode(format));
                            tags.push(formatTag);
-
+                        }
                            break;
-                        case "audio":
-                            formatTag = document.createElement("span");
+                        case "audio": {
+                            let formatTag = document.createElement("span");
                            formatTag.setAttribute("class", "badge badge-pill badge-audio");
                            formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
                            tags.push(formatTag);
+                        }

                            break;
-                        case "text":
-                            formatTag = document.createElement("span");
+                        case "text": {
+                            let formatTag = document.createElement("span");
                            formatTag.setAttribute("class", "badge badge-pill badge-text");
                            formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
                            tags.push(formatTag);
+                        }

                            break;
                    }
@ -563,17 +576,17 @@
                        docCard.appendChild(contentDiv);
                    }

-                    //Font_name
-                    if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
-                        let contentDiv = document.createElement("div");
-                        contentDiv.setAttribute("class", "content-div bg-light");
-                        contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
-                        docCard.appendChild(contentDiv);
-                    }
-
                    //Audio
-                    if (mimeCategory === "audio") {
-                        //TODO
+                    if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("format_long_name")) {
+
+                        let audio = document.createElement("audio");
+                        audio.setAttribute("preload", "none");
+                        audio.setAttribute("class", "audio-fit fit");
+                        audio.setAttribute("controls", "");
+                        audio.setAttribute("type", hit["_source"]["mime"]);
+                        audio.setAttribute("src", "file/" + hit["_id"]);
+
+                        docCard.appendChild(audio)
                    }

                    if (thumbnail !== null) {
@ -671,6 +684,11 @@
                let selected = tree.selected();

                for (let i = 0; i < selected.length; i++) {
+
+                    if(selected[i].id === "any") {
+                        return "any"
+                    }
+
                    //Only get children
                    if (selected[i].text.indexOf("(") !== -1) {
                        mimeTypes.push(selected[i].id);
@ -734,7 +752,7 @@
                    postBody.mime_types = getSelectedMimeTypes();
                    postBody.must_match = must_match;
                    postBody.directories = selectedDirs;
-                    postBody.path = pathBar.value.replace(/\/$/, ""); //remove trailing slashes
+                    postBody.path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
                    xhttp.setRequestHeader('content-type', 'application/json');
                    xhttp.send(JSON.stringify(postBody));
                }
--- a/thumbnail.py
+++ b/thumbnail.py
@ -1,8 +1,8 @@
 from PIL import Image
 import os
-from parsing import ContentMimeGuesser, ExtensionMimeGuesser
 from multiprocessing import Value
 import ffmpeg
+import cairosvg


 class ThumbnailGenerator:
@ -17,7 +17,16 @@ class ThumbnailGenerator:
        if mime is None:
            return

-        if mime.startswith("image"):
+        if mime == "image/svg+xml":
+
+            try:
+                cairosvg.svg2png(url=path, write_to="tmp")
+                self.generate_image("tmp", dest_path)
+                os.remove("tmp")
+            except Exception:
+                print("Couldn't make thumbnail for " + path)
+
+        elif mime.startswith("image"):

            try:
                self.generate_image(path, dest_path)