ES 7.X support. Bug fixes. UI tweaks. Config fixes

This commit is contained in:
simon 2019-05-26 11:31:28 -04:00
parent 64b743870f
commit 980babc5cc
12 changed files with 139 additions and 102 deletions

View File

@ -5,7 +5,8 @@ default_options = {
"ThumbnailColor": "FF00FF",
"TextFileContentLength": "2000",
"PdfFileContentLength": "2000",
"SpreadsheetContentLength": "2000",
"DocxContentLength": "2000",
"SpreadSheetContentLength": "2000",
"EbookContentLength": "2000",
"MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256
@ -19,6 +20,7 @@ index_every = 10000
nGramMin = 3
nGramMax = 3
elasticsearch_url = "http://localhost:9200"
elasticsearch_index = "sist"
# Password hashing
bcrypt_rounds = 13
@ -41,4 +43,4 @@ try:
except:
cairosvg = False
VERSION = "1.1a"
VERSION = "1.2a"

View File

@ -11,7 +11,7 @@ import config
from indexer import Indexer
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser, DocxParser, EbookParser
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
from search import Search
from storage import Directory
from storage import Task, LocalStorage
@ -154,6 +154,7 @@ class Crawler:
pass
finally:
out_q.task_done()
if self.documents:
self.indexer.index(self.documents, self.dir_id)
@ -162,7 +163,7 @@ class TaskManager:
self.current_task = None
self.storage = storage
self.current_process = None
self.indexer = Indexer("changeme")
self.indexer = Indexer(config.elasticsearch_index)
scheduler = BackgroundScheduler()
scheduler.add_job(self.check_new_task, "interval", seconds=0.5)
@ -188,34 +189,54 @@ class TaskManager:
def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value):
Search("changeme").delete_directory(directory.id)
Search(config.elasticsearch_index).delete_directory(directory.id)
chksum_calcs = self.make_checksums_list(directory)
mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
else ContentMimeGuesser()
c = Crawler(self.make_parser_list(chksum_calcs, directory), mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter, total_files)
done.value = 1
@staticmethod
def make_checksums_list(directory):
    """
    Build the checksum calculators requested for a directory.

    The directory's "CheckSumCalculators" option is read as a comma-separated
    list of names; surrounding whitespace around each name is ignored.
    Recognised names are "md5", "sha1" and "sha256"; any other entry
    (including an empty one) is skipped.

    :param directory: object exposing get_option(name) that returns the option string
    :return: list of calculator instances, in the order the names appear in the option
    """
    calculator_types = {
        "md5": Md5CheckSumCalculator,
        "sha1": Sha1CheckSumCalculator,
        "sha256": Sha256CheckSumCalculator,
    }
    requested = (raw.strip() for raw in directory.get_option("CheckSumCalculators").split(","))
    # One fresh calculator per recognised entry; duplicates in the option yield duplicates here
    return [calculator_types[name]() for name in requested if name in calculator_types]
mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
else ContentMimeGuesser()
c = Crawler([GenericFileParser(chksum_calcs, directory.path),
MediaFileParser(chksum_calcs, directory.path),
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),
PictureFileParser(chksum_calcs, directory.path),
FontParser(chksum_calcs, directory.path),
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path),
DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter, total_files)
done.value = 1
@staticmethod
def make_parser_list(chksum_calcs, directory):
    """
    Build the list of file parsers enabled for a directory.

    A GenericFileParser is always the first element. The remaining parsers are
    added only when their name appears in the directory's comma-separated
    "FileParsers" option (whitespace around each name is ignored). Parsers that
    take a content length read it from the matching "*ContentLength" option and
    convert it with int(); that option is only read when the parser is enabled.

    :param chksum_calcs: checksum calculators handed to every parser constructor
    :param directory: object exposing get_option(name) and a path attribute
    :return: list of parser instances in a fixed order (generic, media, text,
             picture, font, pdf, docx, spreadsheet, ebook)
    """
    enabled = {name.strip() for name in directory.get_option("FileParsers").split(",")}

    def content_length(option_name):
        return int(directory.get_option(option_name))

    # Factories are lazy so that options of disabled parsers are never read or converted
    optional_parsers = (
        ("media", lambda: MediaFileParser(chksum_calcs, directory.path)),
        ("text", lambda: TextFileParser(chksum_calcs, content_length("TextFileContentLength"), directory.path)),
        ("picture", lambda: PictureFileParser(chksum_calcs, directory.path)),
        ("font", lambda: FontParser(chksum_calcs, directory.path)),
        ("pdf", lambda: PdfFileParser(chksum_calcs, content_length("PdfFileContentLength"), directory.path)),
        ("docx", lambda: DocxParser(chksum_calcs, content_length("DocxContentLength"), directory.path)),
        ("spreadsheet",
         lambda: SpreadSheetParser(chksum_calcs, content_length("SpreadSheetContentLength"), directory.path)),
        ("ebook", lambda: EbookParser(chksum_calcs, content_length("EbookContentLength"), directory.path)),
    )

    parsers = [GenericFileParser(chksum_calcs, directory.path)]
    for parser_name, build in optional_parsers:
        if parser_name in enabled:
            parsers.append(build())
    return parsers
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
@ -223,7 +244,7 @@ class TaskManager:
if os.path.exists(dest_path):
shutil.rmtree(dest_path)
docs = Search("changeme").get_all_documents(directory.id)
docs = Search(config.elasticsearch_index).get_all_documents(directory.id)
tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
int(directory.get_option("ThumbnailQuality")),

View File

@ -13,7 +13,7 @@ class Indexer:
self.index_name = index
self.es = elasticsearch.Elasticsearch()
requests.head("http://localhost:9200")
requests.head(config.elasticsearch_url)
if self.es.indices.exists(self.index_name):
print("Index is already setup")
else:
@ -93,7 +93,7 @@ class Indexer:
"genre": {"analyzer": "my_nGram", "type": "text"},
"album_artist": {"analyzer": "my_nGram", "type": "text"},
"content": {"analyzer": "content_analyser", "type": "text"},
}}, doc_type="file", index=self.index_name)
}}, doc_type="file", index=self.index_name, include_type_name=True)
self.es.indices.open(index=self.index_name)

Binary file not shown.

2
run.py
View File

@ -25,7 +25,7 @@ flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)
tm = TaskManager(storage)
search = Search("changeme")
search = Search(config.elasticsearch_index)
def get_dir_size(path):

View File

@ -1,10 +1,12 @@
import json
import os
import elasticsearch
import requests
import config
from elasticsearch import helpers
import config
class Search:
@ -14,9 +16,8 @@ class Search:
try:
requests.head(config.elasticsearch_url)
print("elasticsearch is already running")
except:
print("elasticsearch is not running")
print("elasticsearch is not running!")
self.search_iterator = None
@ -33,7 +34,6 @@ class Search:
info = requests.get("http://localhost:9200/" + self.index_name + "/_stats")
if info.status_code == 200:
parsed_info = json.loads(info.text)
return int(parsed_info["indices"][self.index_name]["total"]["store"]["size_in_bytes"])
@ -171,6 +171,7 @@ class Search:
path_list = []
if "suggest" in suggestions:
for option in suggestions["suggest"]["path"][0]["options"]:
path_list.append(option["_source"]["path"])
@ -202,6 +203,3 @@ class Search:
print("Error: multiple delete tasks at the same time")
except Exception as e:
print(e)

View File

@ -73,7 +73,9 @@ function makeStatsCard(searchResult) {
statsCardBody.setAttribute("class", "card-body");
let stat = document.createElement("p");
stat.appendChild(document.createTextNode(searchResult["hits"]["total"] + " results in " + searchResult["took"] + "ms"));
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
stat.appendChild(document.createTextNode(totalHits + " results in " + searchResult["took"] + "ms"));
let sizeStat = document.createElement("span");
sizeStat.appendChild(document.createTextNode(humanFileSize(searchResult["aggregations"]["total_size"]["value"])));
@ -124,9 +126,15 @@ function humanTime (sec_num) {
let minutes = Math.floor((sec_num - (hours * 3600)) / 60);
let seconds = sec_num - (hours * 3600) - (minutes * 60);
if (hours < 10) {hours = "0" + hours;}
if (minutes < 10) {minutes = "0" + minutes;}
if (seconds < 10) {seconds = "0" + seconds;}
if (hours < 10) {
hours = "0" + hours;
}
if (minutes < 10) {
minutes = "0" + minutes;
}
if (seconds < 10) {
seconds = "0" + seconds;
}
return hours + ":" + minutes + ":" + seconds;
}
@ -302,11 +310,13 @@ function createDocCard(hit) {
break;
case "image": {
if (format !== undefined) {
let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-image");
formatTag.appendChild(document.createTextNode(format));
tags.push(formatTag);
}
}
break;
case "audio": {
if (hit["_source"].hasOwnProperty("format_long_name")) {
@ -315,9 +325,7 @@ function createDocCard(hit) {
formatTag.appendChild(document.createTextNode(hit["_source"]["format_long_name"]));
tags.push(formatTag);
}
}
break;
case "text": {
let formatTag = document.createElement("span");
@ -387,7 +395,9 @@ function createDocCard(hit) {
function makePageIndicator(searchResult) {
let pageIndicator = document.createElement("div");
pageIndicator.appendChild(document.createTextNode(docCount + " / " +searchResult["hits"]["total"]));
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
pageIndicator.appendChild(document.createTextNode(docCount + " / " + totalHits));
return pageIndicator;
}
@ -572,6 +582,7 @@ function updateDirectories() {
searchQueued = true;
}
document.getElementById("directories").addEventListener("change", updateDirectories);
updateDirectories();
searchQueued = false;

View File

@ -1,7 +1,9 @@
import sqlite3
import os
import flask_bcrypt
import sqlite3
import time
import flask_bcrypt
import config
@ -50,6 +52,7 @@ class Directory:
"""
Data structure to hold directory information
"""
def __init__(self, path: str, enabled: bool, options: list, name: str):
self.id = None
self.path = path
@ -77,7 +80,6 @@ class Directory:
class Task:
INDEX = 1
GEN_THUMBNAIL = 2
@ -385,7 +387,6 @@ class LocalStorage:
conn.close()
for db_task in tasks:
task = Task(db_task[2], db_task[1], db_task[3], db_task[4], db_task[0])
self.cached_tasks[task.id] = task

View File

@ -44,7 +44,7 @@
<td>{{ directories[dir].name }}</td>
<td style="word-break: break-all"><pre>{{ directories[dir].path }}</pre></td>
<td><i class="far {{ "fa-check-square" if directories[dir].enabled else "fa-square" }}"></i></td>
<td><a href="directory/{{ dir }}" class="btn btn-primary"><i class="fas fa-cog"></i> </a> Manage</td>
<td><a href="directory/{{ dir }}" class="btn btn-primary"><i class="fas fa-cog"></i> Manage</a> </td>
</tr>
{% endfor %}
</tbody>

View File

@ -59,6 +59,7 @@
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav mr-auto">
{% if session["username"] %}
<li class="nav-item">
<a class="nav-link {% if "directory" == active_page %}active{% endif %}" href="/directory">Directories</a>
</li>
@ -71,6 +72,7 @@
<li class="nav-item">
<a class="nav-link {% if "dashboard" == active_page %}active{% endif %}" href="/dashboard">Dashboard</a>
</li>
{% endif %}
</ul>
{% if session["username"] %}

View File

@ -81,7 +81,7 @@
if (currentTask.total === 0) {
document.getElementById("task-label-" + currentTask.id).innerHTML = "Calculating file count...";
document.getElementById("task-label-" + currentTask.id).innerHTML = "Initializing...";
} else {
let bar = document.getElementById("task-bar-" + currentTask.id);
@ -90,6 +90,8 @@
if (percent === 100) {
bar.classList.add("bg-success")
} else {
bar.classList.remove("bg-success")
}
}

View File

@ -36,7 +36,7 @@ class ThumbnailGenerator:
p.terminate()
print("Timed out: " + path)
else:
self.generate_image("tmp", dest_path)
self.generate_image(tmpfile, dest_path)
except Exception:
print("Couldn't make thumbnail for " + path)