Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git

Commit 2e353c9787 (parent 60349cf954)
multithreading fixes and optimizations
@@ -21,7 +21,7 @@ nGramMax = 3
 elasticsearch_url = "http://localhost:9200"
 
 # Password hashing
-bcrypt_rounds = 14
+bcrypt_rounds = 13
 # sqlite3 database path
 db_path = "./local_storage.db"
 

@@ -29,7 +29,7 @@ db_path = "./local_storage.db"
 allow_guests = True
 
 # Number of threads used for parsing
-parse_threads = 8
+parse_threads = 32
 
 # Number of threads used for thumbnail generation
 tn_threads = 32
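Two of these knobs are worth a note: parse_threads and tn_threads size the parser and thumbnail worker pools (the parser pool is raised to 32 here), and bcrypt_rounds is the bcrypt cost factor, where each extra round roughly doubles hashing time, so dropping 14 to 13 roughly halves it. A minimal sketch of what the round count controls, assuming the standard 'bcrypt' package (the config only exposes the number):

    import bcrypt  # assumption: passwords are hashed with the 'bcrypt' package

    rounds = 13                                  # cost factor from the config above
    salt = bcrypt.gensalt(rounds=rounds)         # the work factor is embedded in the salt
    hashed = bcrypt.hashpw(b"hunter2", salt)     # roughly 2x slower per extra round
    assert bcrypt.checkpw(b"hunter2", hashed)    # verification replays the same cost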
crawler.py (13 changed lines)
@@ -115,6 +115,8 @@ class Crawler:
             doc = parser.parse(full_path)
             doc["mime"] = mime
             out_q.put(doc)
+        except:
+            pass
         finally:
             in_q.task_done()
 

@@ -123,7 +125,7 @@ class Crawler:
         if self.indexer is None:
             while True:
                 try:
-                    doc = out_q.get(timeout=10)
+                    doc = out_q.get(timeout=120)
                     if doc is None:
                         break
                 except Empty:

@@ -134,10 +136,11 @@ class Crawler:
 
         while True:
             try:
-                doc = out_q.get(timeout=10)
+                doc = out_q.get(timeout=120)
                 if doc is None:
                     break
             except Empty:
+                print("outq empty")
                 break
 
             try:
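These consumer loops rely on queue.Queue.get(timeout=...) raising queue.Empty when nothing arrives in time; with the old 10 s window a slow parser could make the drain loop give up while documents were still being produced, which is presumably why the timeout was raised to 120 s and the "outq empty" diagnostic was added. A minimal sketch of the pattern, with illustrative names rather than the project's code:

    import queue

    def drain(out_q: queue.Queue, timeout: float = 120):
        """Collect docs until a None sentinel arrives or the queue stays empty
        for `timeout` seconds, which is treated as 'the producers are gone'."""
        docs = []
        while True:
            try:
                doc = out_q.get(timeout=timeout)
            except queue.Empty:
                print("outq empty")        # same diagnostic the commit adds
                break
            if doc is None:                # sentinel: producers finished cleanly
                break
            docs.append(doc)
        return docs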
@@ -171,7 +174,6 @@ class TaskManager:
         directory = self.storage.dirs()[task.dir_id]
 
         if task.type == Task.INDEX:
-            c = Crawler([])
             self.current_process = Process(target=self.execute_crawl, args=(directory,
                                                                             self.current_task.parsed_files,
                                                                             self.current_task.done,

@@ -236,8 +238,9 @@ class TaskManager:
     def check_new_task(self):
 
         if self.current_task is None:
-            for i in sorted(self.storage.tasks(), reverse=True):
-                self.start_task(self.storage.tasks()[i])
+            tasks = self.storage.tasks()
+            if len(tasks) > 0:
+                self.start_task(tasks[sorted(tasks)[0]])
         else:
             if self.current_task.done.value == 1:
                 self.current_process.terminate()
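The check_new_task change treats storage.tasks() as a dict keyed by task id and starts only the oldest pending task (the lowest id) when nothing is running, instead of iterating over every task in reverse order. A small illustration of what tasks[sorted(tasks)[0]] selects, with hypothetical ids and values:

    # Hypothetical stand-in for self.storage.tasks(): {task_id: task}
    tasks = {7: "generate thumbnails for dir 2", 3: "index dir 1", 12: "index dir 4"}

    if len(tasks) > 0:
        next_task = tasks[sorted(tasks)[0]]   # sorted() on a dict sorts its keys -> 3
        print(next_task)                      # "index dir 1": the oldest queued task runs first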
@@ -57,7 +57,8 @@ class Indexer:
             "analysis": {"tokenizer": {"path_tokenizer": {"type": "path_hierarchy"}}}},
             index=self.index_name)
         self.es.indices.put_settings(body={
-            "analysis": {"tokenizer": {"my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
+            "analysis": {"tokenizer": {
+                "my_nGram_tokenizer": {"type": "nGram", "min_gram": config.nGramMin, "max_gram": config.nGramMax}}}},
             index=self.index_name)
         self.es.indices.put_settings(body={
             "analysis": {"analyzer": {"path_analyser": {"tokenizer": "path_tokenizer", "filter": ["lowercase"]}}}},

@@ -83,7 +84,9 @@ class Indexer:
             "mtime": {"type": "integer"},
             "size": {"type": "long"},
             "directory": {"type": "short"},
-            "name": {"analyzer": "my_nGram", "type": "text"},
+            "name": {"analyzer": "content_analyser", "type": "text",
+                     "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}
+                     },
             "album": {"analyzer": "my_nGram", "type": "text"},
             "artist": {"analyzer": "my_nGram", "type": "text"},
             "title": {"analyzer": "my_nGram", "type": "text"},
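The mapping change turns "name" into a multi-field: the main field is analyzed with content_analyser, while the name.nGram sub-field re-indexes the same stored value through the nGram analyzer, so queries and highlights can target either form. A hedged sketch of the resulting structure as plain data (only the field and analyzer names come from the diff):

    name_mapping = {
        "name": {
            "type": "text",
            "analyzer": "content_analyser",           # whole-token matching on "name"
            "fields": {
                "nGram": {                            # same value, indexed again as nGrams
                    "type": "text",
                    "analyzer": "my_nGram"
                }
            }
        }
    }
    # Documents still carry a single "name" value; Elasticsearch indexes it twice,
    # so "name" and "name.nGram" can be queried and highlighted independently.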
run.py (5 changed lines)

@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 import shutil
 from io import BytesIO

@@ -19,6 +20,10 @@ app = Flask(__name__)
 app.secret_key = "A very secret key"
 storage = LocalStorage(config.db_path)
 
+# Disable flask logging
+flaskLogger = logging.getLogger('werkzeug')
+flaskLogger.setLevel(logging.ERROR)
+
 tm = TaskManager(storage)
 search = Search("changeme")
 
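Setting the werkzeug logger to ERROR silences Flask's per-request log lines without touching the root logger, so the application's own logging keeps working. A minimal sketch of the effect, outside the app and purely illustrative:

    import logging

    logging.basicConfig(level=logging.INFO)

    flaskLogger = logging.getLogger('werkzeug')   # the logger Flask's dev server writes to
    flaskLogger.setLevel(logging.ERROR)           # its INFO-level request lines are now dropped

    logging.getLogger("app").info("still visible")    # unaffected application logging
    flaskLogger.info("GET / 200 -")                   # suppressed by the ERROR level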
search.py (23 changed lines)

@@ -128,9 +128,9 @@ class Search:
                     condition: {
                         "multi_match": {
                             "query": query,
-                            "fields": ["name", "content", "album", "artist", "title", "genre",
-                                       "album_artist", "font_name"],
-                            "operator": "and"
+                            "fields": ["name^3", "name.nGram^2", "content", "album^4", "artist^4", "title^4", "genre",
+                                       "album_artist^4", "font_name^2"],
+                            "operator": "or"
                         }
                     },
                     "filter": filters

@@ -141,15 +141,16 @@ class Search:
             ],
             "highlight": {
                 "fields": {
-                    "content": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
-                    "name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
-                    "font_name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
+                    "content": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
+                    "name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
+                    "name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
+                    "font_name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
                 }
             },
             "aggs": {
                 "total_size": {"sum": {"field": "size"}}
             },
-            "size": 40}, index=self.index_name, scroll="30m")
+            "size": 40}, index=self.index_name, scroll="15m")
 
         return page
 
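The ^N suffixes are per-field boosts in multi_match (a hit on album^4 contributes four times the score of an unboosted field), and switching the operator from "and" to "or" means a document needs to match only one query term rather than all of them; the <mark> tags replace the custom hl span with the browser's native highlight element. A hedged sketch of an equivalent query body (query text and surrounding structure are illustrative):

    query_body = {
        "query": {
            "multi_match": {
                "query": "daft punk",
                # ^N multiplies that field's score contribution
                "fields": ["name^3", "name.nGram^2", "content",
                           "album^4", "artist^4", "title^4",
                           "genre", "album_artist^4", "font_name^2"],
                "operator": "or"                  # any term may match, not all of them
            }
        },
        "highlight": {
            "fields": {
                "name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
                "name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]}
            }
        }
    }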
@@ -189,14 +190,18 @@ class Search:
         return None
 
     def delete_directory(self, dir_id):
-        try:
-            self.es.delete_by_query(body={"query": {
-                "bool": {
-                    "filter": {"term": {"directory": dir_id}}
-                }
-            }}, index=self.index_name)
-        except elasticsearch.exceptions.ConflictError:
-            print("Error: multiple delete tasks at the same time")
+        while True:
+            try:
+                self.es.delete_by_query(body={"query": {
+                    "bool": {
+                        "filter": {"term": {"directory": dir_id}}
+                    }
+                }}, index=self.index_name, request_timeout=60)
+                break
+            except elasticsearch.exceptions.ConflictError:
+                print("Error: multiple delete tasks at the same time")
+            except Exception as e:
+                print(e)
 
 
 
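Wrapping delete_by_query in a retry loop with a break on success means a version conflict (or the new 60 s request_timeout expiring) leads to another attempt instead of a silently abandoned delete. A minimal standalone sketch of that pattern (client setup and index name are illustrative):

    import elasticsearch
    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")       # URL from the config above

    def delete_directory(dir_id, index_name="changeme"):
        while True:
            try:
                es.delete_by_query(body={"query": {
                    "bool": {"filter": {"term": {"directory": dir_id}}}
                }}, index=index_name, request_timeout=60)
                break                                 # success: stop retrying
            except elasticsearch.exceptions.ConflictError:
                print("Error: multiple delete tasks at the same time")   # retry
            except Exception as e:
                print(e)                              # log, then retry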
@@ -94,8 +94,10 @@ body {overflow-y:scroll;}
     }
 }
 
-.hl {
+mark {
     background: #fff217;
+    border-radius: 0;
+    padding: 1px 0;
 }
 
 .content-div {
@@ -206,6 +206,8 @@ function createDocCard(hit) {
 
     if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name")) {
         title.insertAdjacentHTML('afterbegin', hit["highlight"]["name"] + extension);
+    } else if (hit.hasOwnProperty("highlight") && hit["highlight"].hasOwnProperty("name.nGram")) {
+        title.insertAdjacentHTML('afterbegin', hit["highlight"]["name.nGram"] + extension);
     } else {
         title.appendChild(document.createTextNode(hit["_source"]["name"] + extension));
     }

@@ -491,7 +493,6 @@ function search() {
         }
     }
 
-
     //Setup page
     let resultContainer = makeResultContainer();
     searchResults.appendChild(resultContainer);