mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 05:56:44 +00:00
ES 7.X support. Bug fixes. UI tweaks. Config fixes
This commit is contained in:
parent
64b743870f
commit
980babc5cc
@ -5,7 +5,8 @@ default_options = {
|
||||
"ThumbnailColor": "FF00FF",
|
||||
"TextFileContentLength": "2000",
|
||||
"PdfFileContentLength": "2000",
|
||||
"SpreadsheetContentLength": "2000",
|
||||
"DocxContentLength": "2000",
|
||||
"SpreadSheetContentLength": "2000",
|
||||
"EbookContentLength": "2000",
|
||||
"MimeGuesser": "extension", # extension, content
|
||||
"CheckSumCalculators": "", # md5, sha1, sha256
|
||||
@ -19,6 +20,7 @@ index_every = 10000
|
||||
nGramMin = 3
|
||||
nGramMax = 3
|
||||
elasticsearch_url = "http://localhost:9200"
|
||||
elasticsearch_index = "sist"
|
||||
|
||||
# Password hashing
|
||||
bcrypt_rounds = 13
|
||||
@ -41,4 +43,4 @@ try:
|
||||
except:
|
||||
cairosvg = False
|
||||
|
||||
VERSION = "1.1a"
|
||||
VERSION = "1.2a"
|
||||
|
63
crawler.py
63
crawler.py
@ -11,7 +11,7 @@ import config
|
||||
from indexer import Indexer
|
||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||
PdfFileParser, DocxParser, EbookParser
|
||||
PdfFileParser, DocxParser, EbookParser, SpreadSheetParser
|
||||
from search import Search
|
||||
from storage import Directory
|
||||
from storage import Task, LocalStorage
|
||||
@ -154,6 +154,7 @@ class Crawler:
|
||||
pass
|
||||
finally:
|
||||
out_q.task_done()
|
||||
if self.documents:
|
||||
self.indexer.index(self.documents, self.dir_id)
|
||||
|
||||
|
||||
@ -162,7 +163,7 @@ class TaskManager:
|
||||
self.current_task = None
|
||||
self.storage = storage
|
||||
self.current_process = None
|
||||
self.indexer = Indexer("changeme")
|
||||
self.indexer = Indexer(config.elasticsearch_index)
|
||||
|
||||
scheduler = BackgroundScheduler()
|
||||
scheduler.add_job(self.check_new_task, "interval", seconds=0.5)
|
||||
@ -188,34 +189,54 @@ class TaskManager:
|
||||
|
||||
def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value):
|
||||
|
||||
Search("changeme").delete_directory(directory.id)
|
||||
Search(config.elasticsearch_index).delete_directory(directory.id)
|
||||
|
||||
chksum_calcs = self.make_checksums_list(directory)
|
||||
|
||||
mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
|
||||
else ContentMimeGuesser()
|
||||
|
||||
c = Crawler(self.make_parser_list(chksum_calcs, directory), mime_guesser, self.indexer, directory.id)
|
||||
c.crawl(directory.path, counter, total_files)
|
||||
|
||||
done.value = 1
|
||||
|
||||
@staticmethod
|
||||
def make_checksums_list(directory):
|
||||
chksum_calcs = []
|
||||
|
||||
for arg in directory.get_option("CheckSumCalculators").split(","):
|
||||
|
||||
if arg.strip() == "md5":
|
||||
chksum_calcs.append(Md5CheckSumCalculator())
|
||||
elif arg.strip() == "sha1":
|
||||
chksum_calcs.append(Sha1CheckSumCalculator())
|
||||
elif arg.strip() == "sha256":
|
||||
chksum_calcs.append(Sha256CheckSumCalculator())
|
||||
return chksum_calcs
|
||||
|
||||
mime_guesser = ExtensionMimeGuesser() if directory.get_option("MimeGuesser") == "extension" \
|
||||
else ContentMimeGuesser()
|
||||
|
||||
c = Crawler([GenericFileParser(chksum_calcs, directory.path),
|
||||
MediaFileParser(chksum_calcs, directory.path),
|
||||
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path),
|
||||
PictureFileParser(chksum_calcs, directory.path),
|
||||
FontParser(chksum_calcs, directory.path),
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path),
|
||||
DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
|
||||
EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
|
||||
mime_guesser, self.indexer, directory.id)
|
||||
c.crawl(directory.path, counter, total_files)
|
||||
|
||||
done.value = 1
|
||||
@staticmethod
|
||||
def make_parser_list(chksum_calcs, directory):
|
||||
p = [p.strip() for p in directory.get_option("FileParsers").split(",")]
|
||||
parsers = [GenericFileParser(chksum_calcs, directory.path)]
|
||||
if "media" in p:
|
||||
parsers.append(MediaFileParser(chksum_calcs, directory.path))
|
||||
if "text" in p:
|
||||
parsers.append(
|
||||
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")), directory.path))
|
||||
if "picture" in p:
|
||||
parsers.append(PictureFileParser(chksum_calcs, directory.path))
|
||||
if "font" in p:
|
||||
parsers.append(FontParser(chksum_calcs, directory.path))
|
||||
if "pdf" in p:
|
||||
parsers.append(
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")), directory.path))
|
||||
if "docx" in p:
|
||||
parsers.append(DocxParser(chksum_calcs, int(directory.get_option("DocxContentLength")), directory.path))
|
||||
if "spreadsheet" in p:
|
||||
parsers.append(
|
||||
SpreadSheetParser(chksum_calcs, int(directory.get_option("SpreadSheetContentLength")), directory.path))
|
||||
if "ebook" in p:
|
||||
parsers.append(EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path))
|
||||
return parsers
|
||||
|
||||
def execute_thumbnails(self, directory: Directory, total_files: Value, counter: Value, done: Value):
|
||||
|
||||
@ -223,7 +244,7 @@ class TaskManager:
|
||||
if os.path.exists(dest_path):
|
||||
shutil.rmtree(dest_path)
|
||||
|
||||
docs = Search("changeme").get_all_documents(directory.id)
|
||||
docs = Search(config.elasticsearch_index).get_all_documents(directory.id)
|
||||
|
||||
tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
|
||||
int(directory.get_option("ThumbnailQuality")),
|
||||
|
@ -13,7 +13,7 @@ class Indexer:
|
||||
self.index_name = index
|
||||
self.es = elasticsearch.Elasticsearch()
|
||||
|
||||
requests.head("http://localhost:9200")
|
||||
requests.head(config.elasticsearch_url)
|
||||
if self.es.indices.exists(self.index_name):
|
||||
print("Index is already setup")
|
||||
else:
|
||||
@ -93,7 +93,7 @@ class Indexer:
|
||||
"genre": {"analyzer": "my_nGram", "type": "text"},
|
||||
"album_artist": {"analyzer": "my_nGram", "type": "text"},
|
||||
"content": {"analyzer": "content_analyser", "type": "text"},
|
||||
}}, doc_type="file", index=self.index_name)
|
||||
}}, doc_type="file", index=self.index_name, include_type_name=True)
|
||||
|
||||
self.es.indices.open(index=self.index_name)
|
||||
|
||||
|
BIN
local_storage.db
BIN
local_storage.db
Binary file not shown.
2
run.py
2
run.py
@ -25,7 +25,7 @@ flaskLogger = logging.getLogger('werkzeug')
|
||||
flaskLogger.setLevel(logging.ERROR)
|
||||
|
||||
tm = TaskManager(storage)
|
||||
search = Search("changeme")
|
||||
search = Search(config.elasticsearch_index)
|
||||
|
||||
|
||||
def get_dir_size(path):
|
||||
|
12
search.py
12
search.py
@ -1,10 +1,12 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import elasticsearch
|
||||
import requests
|
||||
import config
|
||||
from elasticsearch import helpers
|
||||
|
||||
import config
|
||||
|
||||
|
||||
class Search:
|
||||
|
||||
@ -14,9 +16,8 @@ class Search:
|
||||
|
||||
try:
|
||||
requests.head(config.elasticsearch_url)
|
||||
print("elasticsearch is already running")
|
||||
except:
|
||||
print("elasticsearch is not running")
|
||||
print("elasticsearch is not running!")
|
||||
|
||||
self.search_iterator = None
|
||||
|
||||
@ -33,7 +34,6 @@ class Search:
|
||||
info = requests.get("http://localhost:9200/" + self.index_name + "/_stats")
|
||||
|
||||
if info.status_code == 200:
|
||||
|
||||
parsed_info = json.loads(info.text)
|
||||
|
||||
return int(parsed_info["indices"][self.index_name]["total"]["store"]["size_in_bytes"])
|
||||
@ -171,6 +171,7 @@ class Search:
|
||||
|
||||
path_list = []
|
||||
|
||||
if "suggest" in suggestions:
|
||||
for option in suggestions["suggest"]["path"][0]["options"]:
|
||||
path_list.append(option["_source"]["path"])
|
||||
|
||||
@ -202,6 +203,3 @@ class Search:
|
||||
print("Error: multiple delete tasks at the same time")
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
|
||||
|
||||
|
@ -73,7 +73,9 @@ function makeStatsCard(searchResult) {
|
||||
statsCardBody.setAttribute("class", "card-body");
|
||||
|
||||
let stat = document.createElement("p");
|
||||
stat.appendChild(document.createTextNode(searchResult["hits"]["total"] + " results in " + searchResult["took"] + "ms"));
|
||||
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
|
||||
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
|
||||
stat.appendChild(document.createTextNode(totalHits + " results in " + searchResult["took"] + "ms"));
|
||||
|
||||
let sizeStat = document.createElement("span");
|
||||
sizeStat.appendChild(document.createTextNode(humanFileSize(searchResult["aggregations"]["total_size"]["value"])));
|
||||
@ -124,9 +126,15 @@ function humanTime (sec_num) {
|
||||
let minutes = Math.floor((sec_num - (hours * 3600)) / 60);
|
||||
let seconds = sec_num - (hours * 3600) - (minutes * 60);
|
||||
|
||||
if (hours < 10) {hours = "0" + hours;}
|
||||
if (minutes < 10) {minutes = "0" + minutes;}
|
||||
if (seconds < 10) {seconds = "0" + seconds;}
|
||||
if (hours < 10) {
|
||||
hours = "0" + hours;
|
||||
}
|
||||
if (minutes < 10) {
|
||||
minutes = "0" + minutes;
|
||||
}
|
||||
if (seconds < 10) {
|
||||
seconds = "0" + seconds;
|
||||
}
|
||||
return hours + ":" + minutes + ":" + seconds;
|
||||
}
|
||||
|
||||
@ -302,11 +310,13 @@ function createDocCard(hit) {
|
||||
|
||||
break;
|
||||
case "image": {
|
||||
if (format !== undefined) {
|
||||
let formatTag = document.createElement("span");
|
||||
formatTag.setAttribute("class", "badge badge-pill badge-image");
|
||||
formatTag.appendChild(document.createTextNode(format));
|
||||
tags.push(formatTag);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "audio": {
|
||||
if (hit["_source"].hasOwnProperty("format_long_name")) {
|
||||
@ -315,9 +325,7 @@ function createDocCard(hit) {
|
||||
formatTag.appendChild(document.createTextNode(hit["_source"]["format_long_name"]));
|
||||
tags.push(formatTag);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
break;
|
||||
case "text": {
|
||||
let formatTag = document.createElement("span");
|
||||
@ -387,7 +395,9 @@ function createDocCard(hit) {
|
||||
|
||||
function makePageIndicator(searchResult) {
|
||||
let pageIndicator = document.createElement("div");
|
||||
pageIndicator.appendChild(document.createTextNode(docCount + " / " +searchResult["hits"]["total"]));
|
||||
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
|
||||
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
|
||||
pageIndicator.appendChild(document.createTextNode(docCount + " / " + totalHits));
|
||||
return pageIndicator;
|
||||
}
|
||||
|
||||
@ -572,6 +582,7 @@ function updateDirectories() {
|
||||
|
||||
searchQueued = true;
|
||||
}
|
||||
|
||||
document.getElementById("directories").addEventListener("change", updateDirectories);
|
||||
updateDirectories();
|
||||
searchQueued = false;
|
||||
|
@ -1,7 +1,9 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import flask_bcrypt
|
||||
import sqlite3
|
||||
import time
|
||||
|
||||
import flask_bcrypt
|
||||
|
||||
import config
|
||||
|
||||
|
||||
@ -50,6 +52,7 @@ class Directory:
|
||||
"""
|
||||
Data structure to hold directory information
|
||||
"""
|
||||
|
||||
def __init__(self, path: str, enabled: bool, options: list, name: str):
|
||||
self.id = None
|
||||
self.path = path
|
||||
@ -77,7 +80,6 @@ class Directory:
|
||||
|
||||
|
||||
class Task:
|
||||
|
||||
INDEX = 1
|
||||
GEN_THUMBNAIL = 2
|
||||
|
||||
@ -385,7 +387,6 @@ class LocalStorage:
|
||||
conn.close()
|
||||
|
||||
for db_task in tasks:
|
||||
|
||||
task = Task(db_task[2], db_task[1], db_task[3], db_task[4], db_task[0])
|
||||
self.cached_tasks[task.id] = task
|
||||
|
||||
|
@ -44,7 +44,7 @@
|
||||
<td>{{ directories[dir].name }}</td>
|
||||
<td style="word-break: break-all"><pre>{{ directories[dir].path }}</pre></td>
|
||||
<td><i class="far {{ "fa-check-square" if directories[dir].enabled else "fa-square" }}"></i></td>
|
||||
<td><a href="directory/{{ dir }}" class="btn btn-primary"><i class="fas fa-cog"></i> </a> Manage</td>
|
||||
<td><a href="directory/{{ dir }}" class="btn btn-primary"><i class="fas fa-cog"></i> Manage</a> </td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
|
@ -59,6 +59,7 @@
|
||||
|
||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||
<ul class="navbar-nav mr-auto">
|
||||
{% if session["username"] %}
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {% if "directory" == active_page %}active{% endif %}" href="/directory">Directories</a>
|
||||
</li>
|
||||
@ -71,6 +72,7 @@
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {% if "dashboard" == active_page %}active{% endif %}" href="/dashboard">Dashboard</a>
|
||||
</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
|
||||
{% if session["username"] %}
|
||||
|
@ -81,7 +81,7 @@
|
||||
|
||||
if (currentTask.total === 0) {
|
||||
|
||||
document.getElementById("task-label-" + currentTask.id).innerHTML = "Calculating file count...";
|
||||
document.getElementById("task-label-" + currentTask.id).innerHTML = "Initializing...";
|
||||
|
||||
} else {
|
||||
let bar = document.getElementById("task-bar-" + currentTask.id);
|
||||
@ -90,6 +90,8 @@
|
||||
|
||||
if (percent === 100) {
|
||||
bar.classList.add("bg-success")
|
||||
} else {
|
||||
bar.classList.remove("bg-success")
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -36,7 +36,7 @@ class ThumbnailGenerator:
|
||||
p.terminate()
|
||||
print("Timed out: " + path)
|
||||
else:
|
||||
self.generate_image("tmp", dest_path)
|
||||
self.generate_image(tmpfile, dest_path)
|
||||
except Exception:
|
||||
print("Couldn't make thumbnail for " + path)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user