From 980babc5ccc98d02a1e6d26513b9fe8bbd7bd0ea Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 26 May 2019 11:31:28 -0400 Subject: [PATCH] ES 7.X support. Bug fixes. UI tweaks. Config fixes --- config.py | 6 ++- crawler.py | 67 ++++++++++++++++++---------- indexer.py | 4 +- local_storage.db | Bin 40960 -> 40960 bytes run.py | 2 +- search.py | 16 +++---- static/js/search.js | 91 ++++++++++++++++++++++----------------- storage.py | 21 ++++----- templates/directory.html | 2 +- templates/layout.html | 26 +++++------ templates/task.html | 4 +- thumbnail.py | 2 +- 12 files changed, 139 insertions(+), 102 deletions(-) diff --git a/config.py b/config.py index 25f5d3d..2c1d543 100644 --- a/config.py +++ b/config.py @@ -5,7 +5,8 @@ default_options = { "ThumbnailColor": "FF00FF", "TextFileContentLength": "2000", "PdfFileContentLength": "2000", - "SpreadsheetContentLength": "2000", + "DocxContentLength": "2000", + "SpreadSheetContentLength": "2000", "EbookContentLength": "2000", "MimeGuesser": "extension", # extension, content "CheckSumCalculators": "", # md5, sha1, sha256 @@ -19,6 +20,7 @@ index_every = 10000 nGramMin = 3 nGramMax = 3 elasticsearch_url = "http://localhost:9200" +elasticsearch_index = "sist" # Password hashing bcrypt_rounds = 13 @@ -41,4 +43,4 @@ try: except: cairosvg = False -VERSION = "1.1a" +VERSION = "1.2a" diff --git a/crawler.py b/crawler.py index d2e7785..6b64b8e 100644 --- a/crawler.py +++ b/crawler.py @@ -11,7 +11,7 @@ import config from indexer import Indexer from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ - PdfFileParser, DocxParser, EbookParser + PdfFileParser, DocxParser, EbookParser, SpreadSheetParser from search import Search from storage import Directory from storage import Task, LocalStorage @@ -53,7 +53,7 @@ class Crawler: self.mime_guesser = mime_guesser - def crawl(self, root_dir: str, counter: Value = None, total_files = None): + def crawl(self, root_dir: str, counter: Value = None, total_files=None): in_q = Queue(50000) # TODO: get from config? out_q = Queue() @@ -154,7 +154,8 @@ class Crawler: pass finally: out_q.task_done() - self.indexer.index(self.documents, self.dir_id) + if self.documents: + self.indexer.index(self.documents, self.dir_id) class TaskManager: @@ -162,7 +163,7 @@ class TaskManager: self.current_task = None self.storage = storage self.current_process = None - self.indexer = Indexer("changeme") + self.indexer = Indexer(config.elasticsearch_index) scheduler = BackgroundScheduler() scheduler.add_job(self.check_new_task, "interval", seconds=0.5) @@ -188,34 +189,54 @@ class TaskManager: def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value): - Search("changeme").delete_directory(directory.id) + Search(config.elasticsearch_index).delete_directory(directory.id) + chksum_calcs = self.make_checksums_list(directory) + + mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \ + else ContentMimeGuesser() + + c = Crawler(self.make_parser_list(chksum_calcs, directory), mime_guesser, self.indexer, directory.id) + c.crawl(directory.path, counter, total_files) + + done.value = 1 + + @staticmethod + def make_checksums_list(directory): chksum_calcs = [] - for arg in directory.get_option("CheckSumCalculators").split(","): - if arg.strip() == "md5": chksum_calcs.append(Md5CheckSumCalculator()) elif arg.strip() == "sha1": chksum_calcs.append(Sha1CheckSumCalculator()) elif arg.strip() == "sha256": chksum_calcs.append(Sha256CheckSumCalculator()) + return chksum_calcs - mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \ - else ContentMimeGuesser() - - c = Crawler([GenericFileParser(chksum_calcs, directory.path), - MediaFileParser(chksum_calcs, directory.path), - TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path), - PictureFileParser(chksum_calcs, directory.path), - FontParser(chksum_calcs, directory.path), - PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path), - DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path), - EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)], - mime_guesser, self.indexer, directory.id) - c.crawl(directory.path, counter, total_files) - - done.value = 1 + @staticmethod + def make_parser_list(chksum_calcs, directory): + p = [p.strip() for p in directory.get_option("FileParsers").split(",")] + parsers = [GenericFileParser(chksum_calcs, directory.path)] + if "media" in p: + parsers.append(MediaFileParser(chksum_calcs, directory.path)) + if "text" in p: + parsers.append( + TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path)) + if "picture" in p: + parsers.append(PictureFileParser(chksum_calcs, directory.path)) + if "font" in p: + parsers.append(FontParser(chksum_calcs, directory.path)) + if "pdf" in p: + parsers.append( + PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path)) + if "docx" in p: + parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path)) + if "spreadsheet" in p: + parsers.append( + SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path)) + if "ebook" in p: + parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)) + return parsers def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value): @@ -223,7 +244,7 @@ class TaskManager: if os.path.exists(dest_path): shutil.rmtree(dest_path) - docs = Search("changeme").get_all_documents(directory.id) + docs = Search(config.elasticsearch_index).get_all_documents(directory.id) tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")), int(directory.get_option("ThumbnailQuality")), diff --git a/indexer.py b/indexer.py index 2b881ab..6033073 100644 --- a/indexer.py +++ b/indexer.py @@ -13,7 +13,7 @@ class Indexer: self.index_name = index self.es = elasticsearch.Elasticsearch() - requests.head("http://localhost:9200") + requests.head(config.elasticsearch_url) if self.es.indices.exists(self.index_name): print("Index is already setup") else: @@ -93,7 +93,7 @@ class Indexer: "genre": {"analyzer": "my_nGram", "type": "text"}, "album_artist": {"analyzer": "my_nGram", "type": "text"}, "content": {"analyzer": "content_analyser", "type": "text"}, - }}, doc_type="file", index=self.index_name) + }}, doc_type="file", index=self.index_name, include_type_name=True) self.es.indices.open(index=self.index_name) diff --git a/local_storage.db b/local_storage.db index 15bcfe66a58f5b3f994a23d885248b00be15e5c3..5209cf277ec5cb623709225fca981c690173a244 100644 GIT binary patch literal 40960 zcmeI)Piz}m9S86k&mVjIZ??3K|0IoPld_E)r*@L2X()v{$v8NhIBDWkWI1>z_Up!t z?QuQhv_V3UT?uhqgxJF(apH&&9FPj)%4LNFCl+bp2Jx|udCb2*zs2Jj^Z(^TzYLo z%`9a*%f*xpHLY%_*#&j88BME(?rly6(6L=>m&Z!QnA;tDT@r)&IhH9^i{y#%s7yaq zJ)>w1AQZJ#_2k|D>#GUF#Gn`nqjTg%#vPx{wQ{Y_B^c_T>G${j7&*w z<<>G;y3wkdZ64@7a?lEKUn@6GKLt7t?f-a~Vuug;rtCYj&0WZ8#-23`9pPIkpD6f~ zt)Dzg&4U^_xn9sKwK6eD@fcrORMV-gm7J2aj#Vo=ZpVW$T%U6? zw$6#c@o{#$xvNK~q(L4vNVPy-SmiqaUdSA?UXpRTW$f@0>kLMt?BU~PovqhEYr)Y` zPb>KoY1TAzw|OooTiMLLt>*rByeW{X^#;^JbjKzgVo;6Rox!ar@{_Gaj=r0^tanr& zokjU3BY!0SLjJB?me=VDCI~*nDKy=s8r3wqGoE- ziPe%(u;z#$FcZOc&SYY>VU!BZhf!W&W}{MKSFeynv94E1B1@i_g|gl#wiaIF1SU1u zu3#pSsuy-k`y{cgH_XNkIeJnY0yBNNUG7H0tZRimQa5r(&!cQtbBp@pYFXEcMr*

iMDP+n$G>j3?&tx<7V*;QopGd+sIoggYSpTl$0aRQiGRSlX8E zNF%P_yME>Rk?T9Iw_HoEYcA3GvGX_1pV1&NK>z{}fB*zOO9IRD0yFweHeD){b*)ZE zvN9^9SkmH(NpJs)E45O=Y}84dj>22DR*MhlNM3s3Nn9~%bvnG5u}cU|Cp)^nCx%n> zzCRzkuuBSin~lnXRxUKk8XbGg5F>YKGf{Sx4xY5sAO>xRR!^!19i>Vf`m(@G23Xaq zV}Uk9s^$u*zGd!CC6h^Un<~RM_>{Gc990wojLqPftd-g>%|Aw zPIl)DSI-Tc3oCZuTqJ4>&J7Ls0p~_8p9;o6m9jDSQy>e*jjrlSPR1SGrea@ZOuARyWHmSH+ZKrAfv>MR3vSb(b+ z+gV3_Wsr^qFma)s-0CW=&;P9al%fBaAOHafKmY;|fB*y_009U<00Izr$pq4@5Ip21 zqg2tWSEhEZOio|f+RxsrRd$y2*;=kab{@_z-@1RF+|ZX+zM^iuageOs-(SpWH@DY~ z^_gTgaeXmcSbt=V|HsPD=S?5P$##AOHafKmY;|fB*y_009V`RlvmspIqHr6R#pZ*R;P_A3-~v)H2jfY)F1#u3X2*P05L%h)c^nh diff --git a/run.py b/run.py index 09a6f67..c2da694 100644 --- a/run.py +++ b/run.py @@ -25,7 +25,7 @@ flaskLogger = logging.getLogger('werkzeug') flaskLogger.setLevel(logging.ERROR) tm = TaskManager(storage) -search = Search("changeme") +search = Search(config.elasticsearch_index) def get_dir_size(path): diff --git a/search.py b/search.py index 6402107..640c7ec 100644 --- a/search.py +++ b/search.py @@ -1,10 +1,12 @@ import json import os + import elasticsearch import requests -import config from elasticsearch import helpers +import config + class Search: @@ -14,9 +16,8 @@ class Search: try: requests.head(config.elasticsearch_url) - print("elasticsearch is already running") except: - print("elasticsearch is not running") + print("elasticsearch is not running!") self.search_iterator = None @@ -33,7 +34,6 @@ class Search: info = requests.get("http://localhost:9200/" + self.index_name + "/_stats") if info.status_code == 200: - parsed_info = json.loads(info.text) return int(parsed_info["indices"][self.index_name]["total"]["store"]["size_in_bytes"]) @@ -171,8 +171,9 @@ class Search: path_list = [] - for option in suggestions["suggest"]["path"][0]["options"]: - path_list.append(option["_source"]["path"]) + if "suggest" in suggestions: + for option in suggestions["suggest"]["path"][0]["options"]: + path_list.append(option["_source"]["path"]) return path_list @@ -202,6 +203,3 @@ class Search: print("Error: multiple delete tasks at the same time") except Exception as e: print(e) - - - diff --git a/static/js/search.js b/static/js/search.js index 0e54d82..03f6b28 100644 --- a/static/js/search.js +++ b/static/js/search.js @@ -24,7 +24,7 @@ new InspireTreeDOM(tree, { tree.select(); tree.node("any").deselect(); -tree.on("node.click", function(event, node, handler) { +tree.on("node.click", function (event, node, handler) { event.preventTreeDefault(); if (node.id === "any") { @@ -44,23 +44,23 @@ new autoComplete({ selector: '#pathBar', minChars: 1, delay: 75, - renderItem: function (item){ + renderItem: function (item) { return '

' + item + '
'; }, - source: async function(term, suggest) { + source: async function (term, suggest) { term = term.toLowerCase(); const choices = await getPathChoices(); let matches = []; - for (let i=0; i= thresh && u < units.length - 1); + } while (Math.abs(bytes) >= thresh && u < units.length - 1); return bytes.toFixed(1) + ' ' + units[u]; } @@ -118,15 +120,21 @@ function humanFileSize(bytes) { /** * https://stackoverflow.com/questions/6312993 */ -function humanTime (sec_num) { +function humanTime(sec_num) { sec_num = Math.floor(sec_num); - let hours = Math.floor(sec_num / 3600); + let hours = Math.floor(sec_num / 3600); let minutes = Math.floor((sec_num - (hours * 3600)) / 60); let seconds = sec_num - (hours * 3600) - (minutes * 60); - if (hours < 10) {hours = "0" + hours;} - if (minutes < 10) {minutes = "0" + minutes;} - if (seconds < 10) {seconds = "0" + seconds;} + if (hours < 10) { + hours = "0" + hours; + } + if (minutes < 10) { + minutes = "0" + minutes; + } + if (seconds < 10) { + seconds = "0" + seconds; + } return hours + ":" + minutes + ":" + seconds; } @@ -134,7 +142,7 @@ function humanTime (sec_num) { function initPopover() { $('[data-toggle="popover"]').popover({ trigger: "focus", - delay: { "show": 0, "hide": 100 }, + delay: {"show": 0, "hide": 100}, placement: "bottom", html: true }); @@ -152,7 +160,7 @@ function gifOver(thumbnail, documentId) { thumbnail.mouseStayedOver = true; - window.setTimeout(function() { + window.setTimeout(function () { if (thumbnail.mouseStayedOver) { thumbnail.removeEventListener('mouseover', callee, false); @@ -163,7 +171,7 @@ function gifOver(thumbnail, documentId) { }); - thumbnail.addEventListener("mouseout", function() { + thumbnail.addEventListener("mouseout", function () { //Reset timer thumbnail.mouseStayedOver = false; thumbnail.setAttribute("src", "/thumb/" + documentId); @@ -173,10 +181,10 @@ function gifOver(thumbnail, documentId) { function downloadPopover(element, documentId) { element.setAttribute("data-content", - ' Download' + - ' View'); + ' Download' + + ' View'); element.setAttribute("data-toggle", "popover"); - element.addEventListener("mouseover", function() { + element.addEventListener("mouseover", function () { element.focus(); }); } @@ -242,7 +250,7 @@ function createDocCard(hit) { thumbnail.setAttribute("controls", ""); thumbnail.setAttribute("preload", "none"); thumbnail.setAttribute("poster", "/thumb/" + hit["_id"]); - thumbnail.addEventListener("dblclick", function() { + thumbnail.addEventListener("dblclick", function () { thumbnail.webkitRequestFullScreen(); }); @@ -272,7 +280,7 @@ function createDocCard(hit) { var format = hit["_source"]["format_name"]; //Hover - if(format === "GIF") { + if (format === "GIF") { gifOver(thumbnail, hit["_id"]); } break; @@ -302,10 +310,12 @@ function createDocCard(hit) { break; case "image": { - let formatTag = document.createElement("span"); - formatTag.setAttribute("class", "badge badge-pill badge-image"); - formatTag.appendChild(document.createTextNode(format)); - tags.push(formatTag); + if (format !== undefined) { + let formatTag = document.createElement("span"); + formatTag.setAttribute("class", "badge badge-pill badge-image"); + formatTag.appendChild(document.createTextNode(format)); + tags.push(formatTag); + } } break; case "audio": { @@ -315,9 +325,7 @@ function createDocCard(hit) { formatTag.appendChild(document.createTextNode(hit["_source"]["format_long_name"])); tags.push(formatTag); } - } - break; case "text": { let formatTag = document.createElement("span"); @@ -387,13 +395,15 @@ function createDocCard(hit) { function makePageIndicator(searchResult) { let pageIndicator = document.createElement("div"); - pageIndicator.appendChild(document.createTextNode(docCount + " / " +searchResult["hits"]["total"])); + const totalHits = searchResult["hits"]["total"].hasOwnProperty("value") + ? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"]; + pageIndicator.appendChild(document.createTextNode(docCount + " / " + totalHits)); return pageIndicator; } function insertHits(resultContainer, hits) { - for (let i = 0 ; i < hits.length; i++) { + for (let i = 0; i < hits.length; i++) { resultContainer.appendChild(createDocCard(hits[i])); docCount++; } @@ -409,7 +419,7 @@ window.addEventListener("scroll", function () { //load next page let xhttp = new XMLHttpRequest(); - xhttp.onreadystatechange = function() { + xhttp.onreadystatechange = function () { if (this.readyState === 4 && this.status === 200) { let searchResult = JSON.parse(this.responseText); @@ -449,7 +459,7 @@ function getSelectedMimeTypes() { for (let i = 0; i < selected.length; i++) { - if(selected[i].id === "any") { + if (selected[i].id === "any") { return "any" } @@ -468,7 +478,7 @@ function search() { searchQueued = false; //Clear old search results - let searchResults = document.getElementById("searchResults"); + let searchResults = document.getElementById("searchResults"); while (searchResults.firstChild) { searchResults.removeChild(searchResults.firstChild); } @@ -476,7 +486,7 @@ function search() { let query = searchBar.value; let xhttp = new XMLHttpRequest(); - xhttp.onreadystatechange = function() { + xhttp.onreadystatechange = function () { if (this.readyState === 4 && this.status === 200) { let searchResult = JSON.parse(this.responseText); @@ -542,7 +552,7 @@ $("#sizeSlider").ionRangeSlider({ drag_interval: true, prettify: function (num) { - if(num === 0) { + if (num === 0) { return "0 B" } else if (num >= 3684) { return humanFileSize(num * num * num) + "+"; @@ -550,11 +560,11 @@ $("#sizeSlider").ionRangeSlider({ return humanFileSize(num * num * num) }, - onChange: function(e) { + onChange: function (e) { size_min = (e.from * e.from * e.from); size_max = (e.to * e.to * e.to); - if (e.to >= 3684) { + if (e.to >= 3684) { size_max = 10000000000000; } @@ -566,12 +576,13 @@ $("#sizeSlider").ionRangeSlider({ function updateDirectories() { let selected = $('#directories').find('option:selected'); selectedDirs = []; - $(selected).each(function(){ + $(selected).each(function () { selectedDirs.push(parseInt($(this).val())); }); searchQueued = true; } + document.getElementById("directories").addEventListener("change", updateDirectories); updateDirectories(); searchQueued = false; @@ -581,7 +592,7 @@ function getPathChoices() { return new Promise(getPaths => { let xhttp = new XMLHttpRequest(); - xhttp.onreadystatechange = function() { + xhttp.onreadystatechange = function () { if (this.readyState === 4 && this.status === 200) { getPaths(JSON.parse(xhttp.responseText)) } diff --git a/storage.py b/storage.py index 06c8933..88f626c 100644 --- a/storage.py +++ b/storage.py @@ -1,7 +1,9 @@ -import sqlite3 import os -import flask_bcrypt +import sqlite3 import time + +import flask_bcrypt + import config @@ -39,7 +41,7 @@ class Option: Data structure to hold a directory option """ - def __init__(self, key: str, value: str, dir_id: int=None, opt_id: int = None): + def __init__(self, key: str, value: str, dir_id: int = None, opt_id: int = None): self.key = key self.value = value self.id = opt_id @@ -50,6 +52,7 @@ class Directory: """ Data structure to hold directory information """ + def __init__(self, path: str, enabled: bool, options: list, name: str): self.id = None self.path = path @@ -59,7 +62,7 @@ class Directory: def __str__(self): return self.path + " | enabled: " + str(self.enabled) + " | opts: " + str(self.options) - + def get_option(self, key): for option in self.options: @@ -77,7 +80,6 @@ class Directory: class Task: - INDEX = 1 GEN_THUMBNAIL = 2 @@ -268,7 +270,7 @@ class LocalStorage: c = conn.cursor() c.execute("UPDATE User SET is_admin=? WHERE username=?", (user.admin, user.username)) - c.execute("DELETE FROM User_canRead_Directory WHERE username=?", (user.username, )) + c.execute("DELETE FROM User_canRead_Directory WHERE username=?", (user.username,)) conn.commit() for access in user.readable_directories: @@ -332,7 +334,7 @@ class LocalStorage: conn = sqlite3.connect(self.db_path) c = conn.cursor() - c.execute("DELETE FROM Option WHERE id=?", (opt_id, )) + c.execute("DELETE FROM Option WHERE id=?", (opt_id,)) conn.commit() conn.close() @@ -385,7 +387,6 @@ class LocalStorage: conn.close() for db_task in tasks: - task = Task(db_task[2], db_task[1], db_task[3], db_task[4], db_task[0]) self.cached_tasks[task.id] = task @@ -402,7 +403,7 @@ class LocalStorage: conn = sqlite3.connect(self.db_path) c = conn.cursor() - c.execute("DELETE FROM Task WHERE id=?", (task_id, )) + c.execute("DELETE FROM Task WHERE id=?", (task_id,)) conn.commit() c.close() @@ -438,4 +439,4 @@ class LocalStorage: for access in accesses: access_list.append(access[1]) - return access_list \ No newline at end of file + return access_list diff --git a/templates/directory.html b/templates/directory.html index 8a07fdb..1bf5866 100644 --- a/templates/directory.html +++ b/templates/directory.html @@ -44,7 +44,7 @@ {{ directories[dir].name }}
{{ directories[dir].path }}
- Manage + Manage {% endfor %} diff --git a/templates/layout.html b/templates/layout.html index c2e55a4..27c0057 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -59,18 +59,20 @@