From 4eb9cf6b6310b685f1ad6aeedc6bb871c9b2567a Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 17 Apr 2018 11:45:31 -0400 Subject: [PATCH] Audio tags in search page and svg thumbnail generation --- config.py | 6 ++- crawler.py | 7 +-- parsing.py | 111 ++++++++++++++++++------------------------ requirements.txt | 3 +- run.py | 10 ++-- storage.py | 17 ++++--- templates/search.html | 66 ++++++++++++++++--------- thumbnail.py | 13 ++++- 8 files changed, 128 insertions(+), 105 deletions(-) diff --git a/config.py b/config.py index 09e876e..a2de785 100644 --- a/config.py +++ b/config.py @@ -2,8 +2,10 @@ default_options = { "ThumbnailQuality": "85", "ThumbnailSize": "275", "ThumbnailColor": "FF00FF", - "TextFileContentLength": "8192", - "PdfFileContentLength": "8192", + "TextFileContentLength": "2000", + "PdfFileContentLength": "2000", + "SpreadsheetContentLength": "2000", + "EbookContentLength": "2000", "MimeGuesser": "extension", # extension, content "CheckSumCalculators": "", # md5, sha1, sha256 "FileParsers": "media, text, picture, font" # media, text, picture diff --git a/crawler.py b/crawler.py index 99bf7f9..b535873 100644 --- a/crawler.py +++ b/crawler.py @@ -5,7 +5,7 @@ from multiprocessing import Process, Value from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ - PdfFileParser, DocxParser + PdfFileParser, DocxParser, EbookParser from indexer import Indexer from search import Search from thumbnail import ThumbnailGenerator @@ -77,7 +77,7 @@ class Crawler: except FileNotFoundError: continue # File was deleted - if self.indexer is not None: + if self.indexer is not None and len(self.documents) > 0: self.indexer.index(self.documents, self.dir_id) def countFiles(self, root_dir: str): @@ -141,7 +141,8 @@ class TaskManager: PictureFileParser(chksum_calcs), FontParser(chksum_calcs), PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt - DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt + DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt + EbookParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt mime_guesser, self.indexer, directory.id) c.crawl(directory.path, counter) diff --git a/parsing.py b/parsing.py index 3960901..ae80329 100644 --- a/parsing.py +++ b/parsing.py @@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser): "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar", "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch", "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4", - "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po" - ] - - self.encodings = [ - 'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437', - 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', - 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', - 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', - 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140', - 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', - 'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004', - 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', - 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3', - 'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3', - 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8', - 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', - 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u', - 'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', - 'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', - 'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be', - 'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig' + "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po", + "text/x-makefile" ] def parse(self, full_path: str): info = super().parse(full_path) - with open(full_path, "rb") as text_file: - raw_content = text_file.read(self.content_length) + if self.content_length > 0: + with open(full_path, "rb") as text_file: + raw_content = text_file.read(self.content_length) - chardet.detect(raw_content) - encoding = chardet.detect(raw_content)["encoding"] + chardet.detect(raw_content) + encoding = chardet.detect(raw_content)["encoding"] - if encoding is not None and encoding in self.encodings: - info["encoding"] = encoding - content = raw_content.decode(encoding, "ignore") - - info["content"] = html.escape(content) + if encoding is not None: + info["encoding"] = encoding + try: + content = raw_content.decode(encoding, "ignore") + info["content"] = html.escape(content) + except Exception: + print("Unknown encoding: " + encoding) return info @@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser): def parse(self, full_path: str): info = super().parse(full_path) - with open(full_path, "rb") as f: + if self.content_length > 0: + with open(full_path, "rb") as f: - info["content"] = "" + info["content"] = "" - parser = PDFParser(f) - document = PDFDocument(parser) + parser = PDFParser(f) + document = PDFDocument(parser) - if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": - if isinstance(document.info[0]["Title"], bytes): - info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" - else: - info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n" + if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": + if isinstance(document.info[0]["Title"], bytes): + info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" + else: + info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n" - try: - if document.is_extractable: - resource_manager = PDFResourceManager() - la_params = LAParams() + try: + if document.is_extractable: + resource_manager = PDFResourceManager() + la_params = LAParams() - device = PDFPageAggregator(resource_manager, laparams=la_params) - interpreter = PDFPageInterpreter(resource_manager, device) + device = PDFPageAggregator(resource_manager, laparams=la_params) + interpreter = PDFPageInterpreter(resource_manager, device) - for page in PDFPage.create_pages(document): + for page in PDFPage.create_pages(document): - interpreter.process_page(page) - layout = device.get_result() + interpreter.process_page(page) + layout = device.get_result() - for lt_obj in layout: - if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): + for lt_obj in layout: + if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): - text = lt_obj.get_text() + text = lt_obj.get_text() - if len(info["content"]) + len(text) <= self.content_length: - info["content"] += text - else: - info["content"] += text[0:self.content_length - len(info["content"])] - break - else: - continue - break - else: - print("PDF is not extractable: " + full_path) - except ValueError: - print("Couldn't parse page for " + full_path) + if len(info["content"]) + len(text) <= self.content_length: + info["content"] += text + else: + info["content"] += text[0:self.content_length - len(info["content"])] + break + else: + continue + break + else: + print("PDF is not extractable: " + full_path) + except ValueError: + print("Couldn't parse page for " + full_path) return info diff --git a/requirements.txt b/requirements.txt index e01b120..5424fc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ ebooklib html2text docx2txt xlrd -six \ No newline at end of file +six +cairosvg \ No newline at end of file diff --git a/run.py b/run.py index 213678b..5afe1c6 100644 --- a/run.py +++ b/run.py @@ -67,7 +67,7 @@ def download(doc_id): extension = "" if doc["extension"] is None or doc["extension"] == "" else "." + doc["extension"] full_path = os.path.join(directory.path, doc["path"], doc["name"] + extension) - return send_file(full_path, mimetype=doc["mime"]) + return send_file(full_path, mimetype=doc["mime"], conditional=True) @app.route("/thumb/") @@ -195,9 +195,13 @@ def directory_update(dir_id): # Only name and enabled status can be updated updated_dir = Directory(path, enabled, directory.options, name) updated_dir.id = dir_id - storage.update_directory(updated_dir) - flash("Updated directory", "success") + try: + storage.update_directory(updated_dir) + flash("Updated directory", "success") + + except DuplicateDirectoryException: + flash("Couldn't update directory Make sure that the path is unique", "danger") return redirect("/directory/" + str(dir_id)) diff --git a/storage.py b/storage.py index e1de040..c3584e3 100644 --- a/storage.py +++ b/storage.py @@ -278,14 +278,17 @@ class LocalStorage: self.dir_cache_outdated = True - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?", - (directory.name, directory.path, directory.enabled, directory.id)) + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute("UPDATE Directory SET name=?, path=?, enabled=? WHERE id=?", + (directory.name, directory.path, directory.enabled, directory.id)) - c.close() - conn.commit() - conn.close() + c.close() + conn.commit() + conn.close() + except sqlite3.IntegrityError: + raise DuplicateDirectoryException("Duplicate directory: " + directory.path) def save_option(self, option: Option): diff --git a/templates/search.html b/templates/search.html index 3eb39ef..6fe462b 100644 --- a/templates/search.html +++ b/templates/search.html @@ -7,7 +7,7 @@ {% block body %}