multithreading fixes and optimizations

simon987 2019-04-05 19:59:21 -04:00
parent 60349cf954
commit 2e353c9787
7 changed files with 46 additions and 27 deletions

View File

@@ -21,7 +21,7 @@ nGramMax = 3
elasticsearch_url = "http://localhost:9200"
# Password hashing
bcrypt_rounds = 14
bcrypt_rounds = 13
# sqlite3 database path
db_path = "./local_storage.db"
@@ -29,7 +29,7 @@ db_path = "./local_storage.db"
allow_guests = True
# Number of threads used for parsing
parse_threads = 8
parse_threads = 32
# Number of threads used for thumbnail generation
tn_threads = 32
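
Each bcrypt round doubles the hashing work, so dropping the cost factor from 14 to 13 roughly halves the time spent verifying a password at login, while the larger parse_threads/tn_threads values trade CPU for crawl throughput. A minimal sketch of how a cost factor like this is typically consumed, assuming the bcrypt package is the hashing backend (names below are illustrative, not the project's code):

```python
import bcrypt

BCRYPT_ROUNDS = 13  # value from the config above; +1 round roughly doubles hash time

def hash_password(plain: str) -> bytes:
    # gensalt() embeds the cost factor in the salt, so checkpw() later knows
    # how many rounds to run without consulting the config again.
    return bcrypt.hashpw(plain.encode("utf-8"), bcrypt.gensalt(rounds=BCRYPT_ROUNDS))

def check_password(plain: str, hashed: bytes) -> bool:
    return bcrypt.checkpw(plain.encode("utf-8"), hashed)
```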

View File

@@ -115,6 +115,8 @@ class Crawler:
doc = parser.parse(full_path)
doc["mime"] = mime
out_q.put(doc)
except:
pass
finally:
in_q.task_done()
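
This hunk sits inside the parse worker loop: each thread pulls a path from in_q, parses it, and pushes the resulting document onto out_q for the indexer, with task_done() now guaranteed by the finally block. A rough standalone sketch of that pattern (the loop wrapper and the mime handling are simplified assumptions); note that the bare except silently drops files that fail to parse, so printing the traceback is an easy way to keep those failures visible:

```python
import traceback
from queue import Queue

def parse_worker(in_q: Queue, out_q: Queue, parser, mime: str):
    while True:
        full_path = in_q.get()
        if full_path is None:      # sentinel: no more files to parse
            in_q.task_done()
            break
        try:
            doc = parser.parse(full_path)
            doc["mime"] = mime
            out_q.put(doc)
        except Exception:
            traceback.print_exc()  # one bad file should not kill the worker
        finally:
            in_q.task_done()       # unblock in_q.join() even on failure
```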
@@ -123,7 +125,7 @@ class Crawler:
if self.indexer is None:
while True:
try:
doc = out_q.get(timeout=10)
doc = out_q.get(timeout=120)
if doc is None:
break
except Empty:
@@ -134,10 +136,11 @@ class Crawler:
while True:
try:
doc = out_q.get(timeout=10)
doc = out_q.get(timeout=120)
if doc is None:
break
except Empty:
print("outq empty")
break
try:
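
On the consumer side, the timeout bump from 10 to 120 seconds gives slow parser threads more headroom before the loop assumes the crawl is finished; a None document is the explicit shutdown sentinel. A minimal sketch of that consumer, assuming one None is enqueued when the producers finish (the bulk-index call at the end is a hypothetical stand-in):

```python
from queue import Empty, Queue

def index_consumer(out_q: Queue, indexer):
    docs = []
    while True:
        try:
            doc = out_q.get(timeout=120)   # generous timeout for slow parsers
        except Empty:
            break                          # producers have gone quiet
        if doc is None:                    # explicit end-of-crawl sentinel
            break
        docs.append(doc)
    indexer.index(docs)                    # hypothetical bulk-index call
```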
@@ -171,7 +174,6 @@ class TaskManager:
directory = self.storage.dirs()[task.dir_id]
if task.type == Task.INDEX:
c = Crawler([])
self.current_process = Process(target=self.execute_crawl, args=(directory,
self.current_task.parsed_files,
self.current_task.done,
@@ -236,8 +238,9 @@ class TaskManager:
def check_new_task(self):
if self.current_task is None:
for i in sorted(self.storage.tasks(), reverse=True):
self.start_task(self.storage.tasks()[i])
tasks = self.storage.tasks()
if len(tasks) > 0:
self.start_task(tasks[sorted(tasks)[0]])
else:
if self.current_task.done.value == 1:
self.current_process.terminate()
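
check_new_task() now launches at most one task per call, the one with the lowest key returned by storage.tasks(). Assuming task ids sort in creation order, that is equivalent to picking the oldest pending task, e.g.:

```python
def next_task(tasks: dict):
    # tasks maps task id -> Task; min() over the keys gives the same result
    # as sorted(tasks)[0] without sorting the whole key set.
    return tasks[min(tasks)] if tasks else None
```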

View File

@@ -57,7 +57,8 @@ class Indexer:
"analysis": {"tokenizer": {"path_tokenizer": {"type": "path_hierarchy"}}}},
index=self.index_name)
self.es.indices.put_settings(body={
"analysis": {"tokenizer": {"my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
"analysis": {"tokenizer": {
"my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
index=self.index_name)
self.es.indices.put_settings(body={
"analysis": {"analyzer": {"path_analyser": {"tokenizer": "path_tokenizer", "filter": ["lowercase"]}}}},
@@ -83,7 +84,9 @@ class Indexer:
"mtime": {"type": "integer"},
"size": {"type": "long"},
"directory": {"type": "short"},
"name": {"analyzer": "my_nGram", "type": "text"},
"name": {"analyzer": "content_analyser", "type": "text",
"fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}
},
"album": {"analyzer": "my_nGram", "type": "text"},
"artist": {"analyzer": "my_nGram", "type": "text"},
"title": {"analyzer": "my_nGram", "type": "text"},

run.py
View File

@@ -1,4 +1,5 @@
import json
import logging
import os
import shutil
from io import BytesIO
@@ -19,6 +20,10 @@ app = Flask(__name__)
app.secret_key = "A very secret key"
storage = LocalStorage(config.db_path)
# Disable flask logging
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)
tm = TaskManager(storage)
search = Search("changeme")
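
The werkzeug logger is what prints one line per HTTP request in the development server; raising its level to ERROR silences that chatter without touching the application's own logging. Roughly:

```python
import logging
from flask import Flask

app = Flask(__name__)

# werkzeug emits one log line per HTTP request in the dev server;
# ERROR level keeps only actual server errors.
logging.getLogger("werkzeug").setLevel(logging.ERROR)
```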

View File

@@ -128,9 +128,9 @@ class Search:
condition: {
"multi_match": {
"query": query,
"fields": ["name", "content", "album", "artist", "title", "genre",
"album_artist", "font_name"],
"operator": "and"
"fields": ["name^3", "name.nGram^2", "content", "album^4", "artist^4", "title^4", "genre",
"album_artist^4", "font_name^2"],
"operator": "or"
}
},
"filter": filters
@@ -141,15 +141,16 @@ class Search:
],
"highlight": {
"fields": {
"content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
"name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
"font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
"content": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
"name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
"name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
"font_name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
}
},
"aggs": {
"total_size": {"sum": {"field": "size"}}
},
"size": 40}, index=self.index_name, scroll="30m")
"size": 40}, index=self.index_name, scroll="15m")
return page
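
Highlights are now wrapped in semantic <mark> tags instead of a custom span class, and the scroll context is kept for 15 minutes instead of 30. Each subsequent scroll call renews that window, so the shorter value mainly limits how long abandoned result sets are held server-side. A sketch of paging through the scroll, reusing the body and client assumed above:

```python
page = es.search(body=body, index="local_search", scroll="15m", size=40)
while page["hits"]["hits"]:
    for hit in page["hits"]["hits"]:
        # highlight values are lists of fragments; fall back to the raw name
        name = hit.get("highlight", {}).get("name", [hit["_source"]["name"]])[0]
        print(name)                      # contains <mark>...</mark> when highlighted
    page = es.scroll(scroll_id=page["_scroll_id"], scroll="15m")  # renew the window
```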
@@ -189,14 +190,18 @@ class Search:
return None
def delete_directory(self, dir_id):
try:
self.es.delete_by_query(body={"query": {
"bool": {
"filter": {"term": {"directory": dir_id}}
}
}}, index=self.index_name)
except elasticsearch.exceptions.ConflictError:
print("Error: multiple delete tasks at the same time")
while True:
try:
self.es.delete_by_query(body={"query": {
"bool": {
"filter": {"term": {"directory": dir_id}}
}
}}, index=self.index_name, request_timeout=60)
break
except elasticsearch.exceptions.ConflictError:
print("Error: multiple delete tasks at the same time")
except Exception as e:
print(e)
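
delete_by_query aborts with a version-conflict error when another task modifies the same documents mid-delete, so the new loop keeps retrying until the delete goes through. The loop above retries forever and back-to-back; a bounded variant with a small backoff is one way to avoid spinning if the conflict never clears (a sketch, not the project's code):

```python
import time
import elasticsearch

def delete_directory(es, index_name, dir_id, max_attempts=5):
    body = {"query": {"bool": {"filter": {"term": {"directory": dir_id}}}}}
    for attempt in range(max_attempts):
        try:
            es.delete_by_query(body=body, index=index_name, request_timeout=60)
            return True
        except elasticsearch.exceptions.ConflictError:
            time.sleep(2 ** attempt)   # another delete touched the same docs
    return False
```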

View File

@@ -94,8 +94,10 @@ body {overflow-y:scroll;}
}
}
.hl {
mark {
background: #fff217;
border-radius: 0;
padding: 1px 0;
}
.content-div {

View File

@@ -206,6 +206,8 @@ function createDocCard(hit) {
if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name")) {
title.insertAdjacentHTML('afterbegin', hit["highlight"]["name"] + extension);
} else if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name.nGram")) {
title.insertAdjacentHTML('afterbegin', hit["highlight"]["name.nGram"] + extension);
} else {
title.appendChild(document.createTextNode(hit["_source"]["name"] + extension));
}
@@ -491,7 +493,6 @@ function search() {
}
}
//Setup page
let resultContainer = makeResultContainer();
searchResults.appendChild(resultContainer);