mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
multithreading fixes and optimizations
This commit is contained in:
parent
60349cf954
commit
2e353c9787
@ -21,7 +21,7 @@ nGramMax = 3
|
||||
elasticsearch_url = "http://localhost:9200"
|
||||
|
||||
# Password hashing
|
||||
bcrypt_rounds = 14
|
||||
bcrypt_rounds = 13
|
||||
# sqlite3 database path
|
||||
db_path = "./local_storage.db"
|
||||
|
||||
@ -29,7 +29,7 @@ db_path = "./local_storage.db"
|
||||
allow_guests = True
|
||||
|
||||
# Number of threads used for parsing
|
||||
parse_threads = 8
|
||||
parse_threads = 32
|
||||
|
||||
# Number of threads used for thumbnail generation
|
||||
tn_threads = 32
|
||||
|
13
crawler.py
13
crawler.py
@ -115,6 +115,8 @@ class Crawler:
|
||||
doc = parser.parse(full_path)
|
||||
doc["mime"] = mime
|
||||
out_q.put(doc)
|
||||
except:
|
||||
pass
|
||||
finally:
|
||||
in_q.task_done()
|
||||
|
||||
@ -123,7 +125,7 @@ class Crawler:
|
||||
if self.indexer is None:
|
||||
while True:
|
||||
try:
|
||||
doc = out_q.get(timeout=10)
|
||||
doc = out_q.get(timeout=120)
|
||||
if doc is None:
|
||||
break
|
||||
except Empty:
|
||||
@ -134,10 +136,11 @@ class Crawler:
|
||||
|
||||
while True:
|
||||
try:
|
||||
doc = out_q.get(timeout=10)
|
||||
doc = out_q.get(timeout=120)
|
||||
if doc is None:
|
||||
break
|
||||
except Empty:
|
||||
print("outq empty")
|
||||
break
|
||||
|
||||
try:
|
||||
@ -171,7 +174,6 @@ class TaskManager:
|
||||
directory = self.storage.dirs()[task.dir_id]
|
||||
|
||||
if task.type == Task.INDEX:
|
||||
c = Crawler([])
|
||||
self.current_process = Process(target=self.execute_crawl, args=(directory,
|
||||
self.current_task.parsed_files,
|
||||
self.current_task.done,
|
||||
@ -236,8 +238,9 @@ class TaskManager:
|
||||
def check_new_task(self):
|
||||
|
||||
if self.current_task is None:
|
||||
for i in sorted(self.storage.tasks(), reverse=True):
|
||||
self.start_task(self.storage.tasks()[i])
|
||||
tasks = self.storage.tasks()
|
||||
if len(tasks) > 0:
|
||||
self.start_task(tasks[sorted(tasks)[0]])
|
||||
else:
|
||||
if self.current_task.done.value == 1:
|
||||
self.current_process.terminate()
|
||||
|
@ -57,7 +57,8 @@ class Indexer:
|
||||
"analysis": {"tokenizer": {"path_tokenizer": {"type": "path_hierarchy"}}}},
|
||||
index=self.index_name)
|
||||
self.es.indices.put_settings(body={
|
||||
"analysis": {"tokenizer": {"my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
|
||||
"analysis": {"tokenizer": {
|
||||
"my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
|
||||
index=self.index_name)
|
||||
self.es.indices.put_settings(body={
|
||||
"analysis": {"analyzer": {"path_analyser": {"tokenizer": "path_tokenizer", "filter": ["lowercase"]}}}},
|
||||
@ -83,7 +84,9 @@ class Indexer:
|
||||
"mtime": {"type": "integer"},
|
||||
"size": {"type": "long"},
|
||||
"directory": {"type": "short"},
|
||||
"name": {"analyzer": "my_nGram", "type": "text"},
|
||||
"name": {"analyzer": "content_analyser", "type": "text",
|
||||
"fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}
|
||||
},
|
||||
"album": {"analyzer": "my_nGram", "type": "text"},
|
||||
"artist": {"analyzer": "my_nGram", "type": "text"},
|
||||
"title": {"analyzer": "my_nGram", "type": "text"},
|
||||
|
5
run.py
5
run.py
@ -1,4 +1,5 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from io import BytesIO
|
||||
@ -19,6 +20,10 @@ app = Flask(__name__)
|
||||
app.secret_key = "A very secret key"
|
||||
storage = LocalStorage(config.db_path)
|
||||
|
||||
# Disable flask logging
|
||||
flaskLogger = logging.getLogger('werkzeug')
|
||||
flaskLogger.setLevel(logging.ERROR)
|
||||
|
||||
tm = TaskManager(storage)
|
||||
search = Search("changeme")
|
||||
|
||||
|
37
search.py
37
search.py
@ -128,9 +128,9 @@ class Search:
|
||||
condition: {
|
||||
"multi_match": {
|
||||
"query": query,
|
||||
"fields": ["name", "content", "album", "artist", "title", "genre",
|
||||
"album_artist", "font_name"],
|
||||
"operator": "and"
|
||||
"fields": ["name^3", "name.nGram^2", "content", "album^4", "artist^4", "title^4", "genre",
|
||||
"album_artist^4", "font_name^2"],
|
||||
"operator": "or"
|
||||
}
|
||||
},
|
||||
"filter": filters
|
||||
@ -141,15 +141,16 @@ class Search:
|
||||
],
|
||||
"highlight": {
|
||||
"fields": {
|
||||
"content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
"name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
"font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
|
||||
"content": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
|
||||
"name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
|
||||
"name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
|
||||
"font_name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
|
||||
}
|
||||
},
|
||||
"aggs": {
|
||||
"total_size": {"sum": {"field": "size"}}
|
||||
},
|
||||
"size": 40}, index=self.index_name, scroll="30m")
|
||||
"size": 40}, index=self.index_name, scroll="15m")
|
||||
|
||||
return page
|
||||
|
||||
@ -189,14 +190,18 @@ class Search:
|
||||
return None
|
||||
|
||||
def delete_directory(self, dir_id):
|
||||
|
||||
try:
|
||||
self.es.delete_by_query(body={"query": {
|
||||
"bool": {
|
||||
"filter": {"term": {"directory": dir_id}}
|
||||
}
|
||||
}}, index=self.index_name)
|
||||
except elasticsearch.exceptions.ConflictError:
|
||||
print("Error: multiple delete tasks at the same time")
|
||||
while True:
|
||||
try:
|
||||
self.es.delete_by_query(body={"query": {
|
||||
"bool": {
|
||||
"filter": {"term": {"directory": dir_id}}
|
||||
}
|
||||
}}, index=self.index_name, request_timeout=60)
|
||||
break
|
||||
except elasticsearch.exceptions.ConflictError:
|
||||
print("Error: multiple delete tasks at the same time")
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
|
||||
|
||||
|
@ -94,8 +94,10 @@ body {overflow-y:scroll;}
|
||||
}
|
||||
}
|
||||
|
||||
.hl {
|
||||
mark {
|
||||
background: #fff217;
|
||||
border-radius: 0;
|
||||
padding: 1px 0;
|
||||
}
|
||||
|
||||
.content-div {
|
||||
|
@ -206,6 +206,8 @@ function createDocCard(hit) {
|
||||
|
||||
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name")) {
|
||||
title.insertAdjacentHTML('afterbegin', hit["highlight"]["name"] + extension);
|
||||
} else if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name.nGram")) {
|
||||
title.insertAdjacentHTML('afterbegin', hit["highlight"]["name.nGram"] + extension);
|
||||
} else {
|
||||
title.appendChild(document.createTextNode(hit["_source"]["name"] + extension));
|
||||
}
|
||||
@ -491,7 +493,6 @@ function search() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//Setup page
|
||||
let resultContainer = makeResultContainer();
|
||||
searchResults.appendChild(resultContainer);
|
||||
|
Loading…
x
Reference in New Issue
Block a user