From 4eb9cf6b6310b685f1ad6aeedc6bb871c9b2567a Mon Sep 17 00:00:00 2001
From: simon <fortier.simon@hotmail.com>
Date: Tue, 17 Apr 2018 11:45:31 -0400
Subject: [PATCH] Audio tags in search page and svg thumbnail generation

---
 config.py             |   6 ++-
 crawler.py            |   7 +--
 parsing.py            | 111 ++++++++++++++++++------------------------
 requirements.txt      |   3 +-
 run.py                |  10 ++--
 storage.py            |  17 ++++---
 templates/search.html |  66 ++++++++++++++++---------
 thumbnail.py          |  13 ++++-
 8 files changed, 128 insertions(+), 105 deletions(-)

diff --git a/config.py b/config.py
index 09e876e..a2de785 100644
--- a/config.py
+++ b/config.py
@@ -2,8 +2,10 @@ default_options = {
     "ThumbnailQuality": "85",
     "ThumbnailSize": "275",
     "ThumbnailColor": "FF00FF",
-    "TextFileContentLength": "8192",
-    "PdfFileContentLength": "8192",
+    "TextFileContentLength": "2000",
+    "PdfFileContentLength": "2000",
+    "SpreadsheetContentLength": "2000",
+    "EbookContentLength": "2000",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
     "FileParsers": "media, text, picture, font"  # media, text, picture
diff --git a/crawler.py b/crawler.py
index 99bf7f9..b535873 100644
--- a/crawler.py
+++ b/crawler.py
@@ -5,7 +5,7 @@ from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
     PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
-    PdfFileParser, DocxParser
+    PdfFileParser, DocxParser, EbookParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -77,7 +77,7 @@ class Crawler:
                 except FileNotFoundError:
                     continue  # File was deleted
 
-        if self.indexer is not None:
+        if self.indexer is not None and len(self.documents) > 0:
             self.indexer.index(self.documents, self.dir_id)
 
     def countFiles(self, root_dir: str):
@@ -141,7 +141,8 @@ class TaskManager:
                      PictureFileParser(chksum_calcs),
                      FontParser(chksum_calcs),
                      PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
-                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
+                     DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),  # todo get content len from other opt
+                     EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
                     mime_guesser, self.indexer, directory.id)
         c.crawl(directory.path, counter)
 
diff --git a/parsing.py b/parsing.py
index 3960901..ae80329 100644
--- a/parsing.py
+++ b/parsing.py
@@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
             "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
             "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
             "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
-            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
-        ]
-
-        self.encodings = [
-            'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
-            'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
-            'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
-            'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
-            'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
-            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
-            'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
-            'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
-            'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
-            'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
-            'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
-            'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
-            'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
-            'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
-            'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
-            'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
-            'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
+            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
+            "text/x-makefile"
         ]
 
     def parse(self, full_path: str):
         info = super().parse(full_path)
 
-        with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_length)
+        if self.content_length > 0:  # a non-positive length skips reading the file entirely
+            with open(full_path, "rb") as text_file:
+                raw_content = text_file.read(self.content_length)
 
-            chardet.detect(raw_content)
-            encoding = chardet.detect(raw_content)["encoding"]
+                # Detect the encoding with a single chardet pass over the bytes read.
+                encoding = chardet.detect(raw_content)["encoding"]
 
-            if encoding is not None and encoding in self.encodings:
-                info["encoding"] = encoding
-                content = raw_content.decode(encoding, "ignore")
-
-                info["content"] = html.escape(content)
+                if encoding is not None:
+                    info["encoding"] = encoding
+                    try:
+                        content = raw_content.decode(encoding, "ignore")
+                        info["content"] = html.escape(content)
+                    except (LookupError, UnicodeError):  # codec name unknown to Python, or bytes undecodable
+                        print("Unknown encoding: " + encoding)
 
         return info
 
@@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
     def parse(self, full_path: str):
         info = super().parse(full_path)
 
-        with open(full_path, "rb") as f:
+        if self.content_length > 0:  # the PDF is only opened when some content is requested
+            with open(full_path, "rb") as f:
 
-            info["content"] = ""
+                info["content"] = ""  # accumulates the title (if any) followed by page text
 
-            parser = PDFParser(f)
-            document = PDFDocument(parser)
+                parser = PDFParser(f)
+                document = PDFDocument(parser)
 
-            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                if isinstance(document.info[0]["Title"], bytes):
-                    info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                else:
-                    info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
+                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                    if isinstance(document.info[0]["Title"], bytes):
+                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+                    else:  # title is not raw bytes: resolve() it first, then decode
+                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
 
-            try:
-                if document.is_extractable:
-                    resource_manager = PDFResourceManager()
-                    la_params = LAParams()
+                try:
+                    if document.is_extractable:
+                        resource_manager = PDFResourceManager()
+                        la_params = LAParams()
 
-                    device = PDFPageAggregator(resource_manager, laparams=la_params)
-                    interpreter = PDFPageInterpreter(resource_manager, device)
+                        device = PDFPageAggregator(resource_manager, laparams=la_params)
+                        interpreter = PDFPageInterpreter(resource_manager, device)
 
-                    for page in PDFPage.create_pages(document):
+                        for page in PDFPage.create_pages(document):  # one layout per page, read in order
 
-                        interpreter.process_page(page)
-                        layout = device.get_result()
+                            interpreter.process_page(page)
+                            layout = device.get_result()
 
-                        for lt_obj in layout:
-                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+                            for lt_obj in layout:
+                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
 
-                                text = lt_obj.get_text()
+                                    text = lt_obj.get_text()
 
-                                if len(info["content"]) + len(text) <= self.content_length:
-                                    info["content"] += text
-                                else:
-                                    info["content"] += text[0:self.content_length - len(info["content"])]
-                                    break
-                        else:
-                            continue
-                        break
-                else:
-                    print("PDF is not extractable: " + full_path)
-            except ValueError:
-                print("Couldn't parse page for " + full_path)
+                                    if len(info["content"]) + len(text) <= self.content_length:
+                                        info["content"] += text
+                                    else:  # would exceed content_length: keep only what still fits, then stop
+                                        info["content"] += text[0:self.content_length - len(info["content"])]
+                                        break
+                            else:  # for-else: layout loop ended without break, so move on to the next page
+                                continue
+                            break  # layout loop hit the length cap, so stop reading pages as well
+                    else:
+                        print("PDF is not extractable: " + full_path)
+                except ValueError:
+                    print("Couldn't parse page for " + full_path)
 
         return info
 
diff --git a/requirements.txt b/requirements.txt
index e01b120..5424fc3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,5 @@ ebooklib
 html2text
 docx2txt
 xlrd
-six
\ No newline at end of file
+six
+cairosvg
\ No newline at end of file
diff --git a/run.py b/run.py
index 213678b..5afe1c6 100644
--- a/run.py
+++ b/run.py
@@ -67,7 +67,7 @@ def download(doc_id):
     extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"]
     full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension)
 
-    return send_file(full_path, mimetype=doc["mime"])
+    return send_file(full_path, mimetype=doc["mime"], conditional=True)
 
 
 @app.route("/thumb/<doc_id>")
@@ -195,9 +195,13 @@ def directory_update(dir_id):
     # Only name and enabled status can be updated
     updated_dir = Directory(path, enabled, directory.options, name)
     updated_dir.id = dir_id
-    storage.update_directory(updated_dir)
 
-    flash("<strong>Updated directory</strong>", "success")
+    try:
+        storage.update_directory(updated_dir)
+        flash("<strong>Updated directory</strong>", "success")
+
+    except DuplicateDirectoryException:
+        flash("<strong>Couldn't update directory</strong> Make sure that the path is unique", "danger")
 
     return redirect("/directory/" + str(dir_id))
 
diff --git a/storage.py b/storage.py
index e1de040..c3584e3 100644
--- a/storage.py
+++ b/storage.py
@@ -278,14 +278,17 @@ class LocalStorage:
 
         self.dir_cache_outdated = True
 
-        conn = sqlite3.connect(self.db_path)
-        c = conn.cursor()
-        c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
-                  (directory.name, directory.path, directory.enabled, directory.id))
+        try:
+            conn = sqlite3.connect(self.db_path)
+            c = conn.cursor()
+            c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?",
+                      (directory.name, directory.path, directory.enabled, directory.id))
 
-        c.close()
-        conn.commit()
-        conn.close()
+            c.close()
+            conn.commit()
+            conn.close()
+        except sqlite3.IntegrityError:
+            raise DuplicateDirectoryException("Duplicate directory: " + directory.path)
 
     def save_option(self, option: Option):
 
diff --git a/templates/search.html b/templates/search.html
index 3eb39ef..6fe462b 100644
--- a/templates/search.html
+++ b/templates/search.html
@@ -7,7 +7,7 @@
 {% block body %}
 
     <style>
-        body    {overflow-y:scroll;}
+        body {overflow-y:scroll;}
         .document {
             padding: 0.5rem;
         }
@@ -79,13 +79,17 @@
         .fit {
             width: 100%;
             height: 100%;
-        {#            margin-top: 3px;#}
             padding: 3px;
             min-width: 64px;
             max-width: 100%;
             max-height: 256px;
         }
 
+        .audio-fit {
+            height: 39px;
+            vertical-align: bottom;
+        }
+
         @media (min-width: 1200px) {
             .card-columns {
                 column-count: 4;
@@ -156,7 +160,6 @@
     <div class="container">
 
         <div class="card">
-            {#            <div class="card-header">An excellent form</div>#}
             <div class="card-body">
                 <div class="form-group">
                     <input id="pathBar" type="search" class="form-control" placeholder="Path">
@@ -190,8 +193,6 @@
                     <div class="col">
                         <label>Mime types</label>
 
-                        <button class="btn btn-xs btn-success" onclick="toggleTree()" style="float: right">Toggle</button>
-
                         <div class="tree"></div>
                     </div>
                 </div>
@@ -209,9 +210,20 @@
 
                     //Select all
                     tree.select();
+                    tree.node("any").deselect();
 
                     tree.on("node.click", function(event, node, handler) {
                         event.preventTreeDefault();
+
+                        if (node.id === "any") {
+
+                            if (!node.itree.state.checked) {
+                                tree.deselect();
+                            }
+                        } else {
+                            tree.node("any").deselect();
+                        }
+
                         handler();
                         searchQueued = true;
                     })
@@ -530,26 +542,27 @@
                             }
 
                             break;
-                        case "image":
-
-                            formatTag = document.createElement("span");
+                        case "image": {
+                            let formatTag = document.createElement("span");
                             formatTag.setAttribute("class", "badge badge-pill badge-image");
                             formatTag.appendChild(document.createTextNode(format));
                             tags.push(formatTag);
-
+                        }
                             break;
-                        case "audio":
-                            formatTag = document.createElement("span");
+                        case "audio": {
+                            let formatTag = document.createElement("span");
                             formatTag.setAttribute("class", "badge badge-pill badge-audio");
                             formatTag.appendChild(document.createTextNode(hit["_source"]["format_name"]));
                             tags.push(formatTag);
+                        }
 
                             break;
-                        case "text":
-                            formatTag = document.createElement("span");
+                        case "text": {
+                            let formatTag = document.createElement("span");
                             formatTag.setAttribute("class", "badge badge-pill badge-text");
                             formatTag.appendChild(document.createTextNode(hit["_source"]["encoding"]));
                             tags.push(formatTag);
+                        }
 
                             break;
                     }
@@ -563,17 +576,17 @@
                         docCard.appendChild(contentDiv);
                     }
 
-                    //Font_name
-                    if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("font_name")) {
-                        let contentDiv = document.createElement("div");
-                        contentDiv.setAttribute("class", "content-div bg-light");
-                        contentDiv.insertAdjacentHTML('afterbegin', hit["highlight"]["font_name"][0]);
-                        docCard.appendChild(contentDiv);
-                    }
-
                     //Audio
-                    if (mimeCategory === "audio") {
-                        //TODO
+                    if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("format_long_name")) {
+
+                        let audio = document.createElement("audio");
+                        audio.setAttribute("preload", "none");
+                        audio.setAttribute("class", "audio-fit fit");
+                        audio.setAttribute("controls", "");
+                        audio.setAttribute("type", hit["_source"]["mime"]);
+                        audio.setAttribute("src", "file/" + hit["_id"]);
+
+                        docCard.appendChild(audio)
                     }
 
                     if (thumbnail !== null) {
@@ -671,6 +684,11 @@
                 let selected = tree.selected();
 
                 for (let i = 0; i < selected.length; i++) {
+
+                    if(selected[i].id === "any") {
+                        return "any"
+                    }
+
                     //Only get children
                     if (selected[i].text.indexOf("(") !== -1) {
                         mimeTypes.push(selected[i].id);
@@ -734,7 +752,7 @@
                     postBody.mime_types = getSelectedMimeTypes();
                     postBody.must_match = must_match;
                     postBody.directories = selectedDirs;
-                    postBody.path = pathBar.value.replace(/\/$/, ""); //remove trailing slashes
+                    postBody.path = pathBar.value.replace(/\/$/, "").toLowerCase(); //remove trailing slashes
                     xhttp.setRequestHeader('content-type', 'application/json');
                     xhttp.send(JSON.stringify(postBody));
                 }
diff --git a/thumbnail.py b/thumbnail.py
index ddb01c5..038b7b0 100644
--- a/thumbnail.py
+++ b/thumbnail.py
@@ -1,8 +1,8 @@
 from PIL import Image
 import os
-from parsing import ContentMimeGuesser, ExtensionMimeGuesser
 from multiprocessing import Value
 import ffmpeg
+import cairosvg
 
 
 class ThumbnailGenerator:
@@ -17,7 +17,16 @@ class ThumbnailGenerator:
         if mime is None:
             return
 
-        if mime.startswith("image"):
+        if mime == "image/svg+xml":
+
+            try:
+                cairosvg.svg2png(url=path, write_to="tmp")
+                self.generate_image("tmp", dest_path)
+                os.remove("tmp")
+            except Exception:
+                print("Couldn't make thumbnail for " + path)
+
+        elif mime.startswith("image"):
 
             try:
                 self.generate_image(path, dest_path)