mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-19 18:16:45 +00:00
Bug fixes + multi threading
This commit is contained in:
parent
36b9ed6cb7
commit
746ad25a4e
@ -29,6 +29,7 @@ Once the web server is running, you can connect to the search interface by typin
|
|||||||
|
|
||||||
* Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch)
|
* Download and install [Elasticsearch](https://www.elastic.co/downloads/elasticsearch)
|
||||||
|
|
||||||
|
* Edit settings in [config.py](https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py) (Default values are ok in most cases)
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/simon987/Simple-Incremental-Search-Tool
|
git clone https://github.com/simon987/Simple-Incremental-Search-Tool
|
||||||
|
|
||||||
|
11
config.py
11
config.py
@ -26,7 +26,14 @@ bcrypt_rounds = 14
|
|||||||
db_path = "./local_storage.db"
|
db_path = "./local_storage.db"
|
||||||
|
|
||||||
# Set to true to allow guests to search any directory
|
# Set to true to allow guests to search any directory
|
||||||
allow_guests = False
|
allow_guests = True
|
||||||
|
|
||||||
|
# Number of threads used for parsing
|
||||||
|
parse_threads = 8
|
||||||
|
|
||||||
|
# Number of threads used for thumbnail generation
|
||||||
|
tn_threads = 32
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cairosvg
|
import cairosvg
|
||||||
@ -34,4 +41,4 @@ try:
|
|||||||
except:
|
except:
|
||||||
cairosvg = False
|
cairosvg = False
|
||||||
|
|
||||||
VERSION = "1.0a"
|
VERSION = "1.1a"
|
||||||
|
128
crawler.py
128
crawler.py
@ -2,6 +2,8 @@ import json
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
from multiprocessing import Process, Value
|
from multiprocessing import Process, Value
|
||||||
|
from queue import Queue, Empty, Full
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
|
|
||||||
@ -51,39 +53,42 @@ class Crawler:
|
|||||||
|
|
||||||
self.mime_guesser = mime_guesser
|
self.mime_guesser = mime_guesser
|
||||||
|
|
||||||
def crawl(self, root_dir: str, counter: Value = None):
|
def crawl(self, root_dir: str, counter: Value = None, total_files = None):
|
||||||
|
|
||||||
document_counter = 0
|
in_q = Queue(50000) # TODO: get from config?
|
||||||
|
out_q = Queue()
|
||||||
|
|
||||||
|
threads = []
|
||||||
|
print("Creating %d threads" % (config.parse_threads,))
|
||||||
|
for _ in range(config.parse_threads):
|
||||||
|
t = Thread(target=self.parse_file, args=[in_q, out_q, ])
|
||||||
|
threads.append(t)
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
indexer_thread = Thread(target=self.index_file, args=[out_q, counter, ])
|
||||||
|
indexer_thread.start()
|
||||||
|
|
||||||
for root, dirs, files in os.walk(root_dir):
|
for root, dirs, files in os.walk(root_dir):
|
||||||
|
|
||||||
for filename in files:
|
for filename in files:
|
||||||
full_path = os.path.join(root, filename)
|
while True:
|
||||||
|
|
||||||
mime = self.mime_guesser.guess_mime(full_path)
|
|
||||||
|
|
||||||
parser = self.ext_map.get(mime, self.default_parser)
|
|
||||||
|
|
||||||
document_counter += 1
|
|
||||||
if document_counter >= config.index_every:
|
|
||||||
document_counter = 0
|
|
||||||
|
|
||||||
self.indexer.index(self.documents, self.dir_id)
|
|
||||||
self.documents.clear()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if counter:
|
in_q.put(os.path.join(root, filename), timeout=10)
|
||||||
counter.value += 1
|
if total_files:
|
||||||
|
total_files.value += 1
|
||||||
|
break
|
||||||
|
except Full:
|
||||||
|
continue
|
||||||
|
|
||||||
doc = parser.parse(full_path)
|
in_q.join()
|
||||||
doc["mime"] = mime
|
out_q.join()
|
||||||
|
|
||||||
self.documents.append(doc)
|
for _ in threads:
|
||||||
except FileNotFoundError:
|
in_q.put(None)
|
||||||
continue # File was deleted
|
out_q.put(None)
|
||||||
|
|
||||||
if self.indexer is not None and len(self.documents) > 0:
|
indexer_thread.join()
|
||||||
self.indexer.index(self.documents, self.dir_id)
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
|
||||||
def countFiles(self, root_dir: str):
|
def countFiles(self, root_dir: str):
|
||||||
count = 0
|
count = 0
|
||||||
@ -93,6 +98,61 @@ class Crawler:
|
|||||||
|
|
||||||
return count
|
return count
|
||||||
|
|
||||||
|
def parse_file(self, in_q: Queue, out_q: Queue):
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
full_path = in_q.get(timeout=1)
|
||||||
|
if full_path is None:
|
||||||
|
break
|
||||||
|
except Empty:
|
||||||
|
break
|
||||||
|
|
||||||
|
try:
|
||||||
|
mime = self.mime_guesser.guess_mime(full_path)
|
||||||
|
parser = self.ext_map.get(mime, self.default_parser)
|
||||||
|
|
||||||
|
doc = parser.parse(full_path)
|
||||||
|
doc["mime"] = mime
|
||||||
|
out_q.put(doc)
|
||||||
|
finally:
|
||||||
|
in_q.task_done()
|
||||||
|
|
||||||
|
def index_file(self, out_q: Queue, count: Value):
|
||||||
|
|
||||||
|
if self.indexer is None:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
doc = out_q.get(timeout=10)
|
||||||
|
if doc is None:
|
||||||
|
break
|
||||||
|
except Empty:
|
||||||
|
break
|
||||||
|
self.documents.append(doc)
|
||||||
|
out_q.task_done()
|
||||||
|
return
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
doc = out_q.get(timeout=10)
|
||||||
|
if doc is None:
|
||||||
|
break
|
||||||
|
except Empty:
|
||||||
|
break
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.documents.append(doc)
|
||||||
|
count.value += 1
|
||||||
|
|
||||||
|
if count.value % config.index_every == 0:
|
||||||
|
self.indexer.index(self.documents, self.dir_id)
|
||||||
|
self.documents.clear()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
out_q.task_done()
|
||||||
|
self.indexer.index(self.documents, self.dir_id)
|
||||||
|
|
||||||
|
|
||||||
class TaskManager:
|
class TaskManager:
|
||||||
def __init__(self, storage: LocalStorage):
|
def __init__(self, storage: LocalStorage):
|
||||||
@ -112,10 +172,10 @@ class TaskManager:
|
|||||||
|
|
||||||
if task.type == Task.INDEX:
|
if task.type == Task.INDEX:
|
||||||
c = Crawler([])
|
c = Crawler([])
|
||||||
self.current_task.total_files.value = c.countFiles(directory.path)
|
self.current_process = Process(target=self.execute_crawl, args=(directory,
|
||||||
|
self.current_task.parsed_files,
|
||||||
self.current_process = Process(target=self.execute_crawl, args=(directory, self.current_task.parsed_files,
|
self.current_task.done,
|
||||||
self.current_task.done))
|
self.current_task.total_files))
|
||||||
|
|
||||||
elif task.type == Task.GEN_THUMBNAIL:
|
elif task.type == Task.GEN_THUMBNAIL:
|
||||||
self.current_process = Process(target=self.execute_thumbnails, args=(directory,
|
self.current_process = Process(target=self.execute_thumbnails, args=(directory,
|
||||||
@ -124,7 +184,7 @@ class TaskManager:
|
|||||||
self.current_task.done))
|
self.current_task.done))
|
||||||
self.current_process.start()
|
self.current_process.start()
|
||||||
|
|
||||||
def execute_crawl(self, directory: Directory, counter: Value, done: Value):
|
def execute_crawl(self, directory: Directory, counter: Value, done: Value, total_files: Value):
|
||||||
|
|
||||||
Search("changeme").delete_directory(directory.id)
|
Search("changeme").delete_directory(directory.id)
|
||||||
|
|
||||||
@ -151,7 +211,7 @@ class TaskManager:
|
|||||||
DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
|
DocxParser(chksum_calcs, int(directory.get_option("SpreadsheetContentLength")), directory.path),
|
||||||
EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
|
EbookParser(chksum_calcs, int(directory.get_option("EbookContentLength")), directory.path)],
|
||||||
mime_guesser, self.indexer, directory.id)
|
mime_guesser, self.indexer, directory.id)
|
||||||
c.crawl(directory.path, counter)
|
c.crawl(directory.path, counter, total_files)
|
||||||
|
|
||||||
done.value = 1
|
done.value = 1
|
||||||
|
|
||||||
@ -161,14 +221,12 @@ class TaskManager:
|
|||||||
if os.path.exists(dest_path):
|
if os.path.exists(dest_path):
|
||||||
shutil.rmtree(dest_path)
|
shutil.rmtree(dest_path)
|
||||||
|
|
||||||
docs = list(Search("changeme").get_all_documents(directory.id))
|
docs = Search("changeme").get_all_documents(directory.id)
|
||||||
|
|
||||||
total_files.value = len(docs)
|
|
||||||
|
|
||||||
tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
|
tn_generator = ThumbnailGenerator(int(directory.get_option("ThumbnailSize")),
|
||||||
int(directory.get_option("ThumbnailQuality")),
|
int(directory.get_option("ThumbnailQuality")),
|
||||||
directory.get_option("ThumbnailColor"))
|
directory.get_option("ThumbnailColor"))
|
||||||
tn_generator.generate_all(docs, dest_path, counter, directory)
|
tn_generator.generate_all(docs, dest_path, counter, directory, total_files)
|
||||||
|
|
||||||
done.value = 1
|
done.value = 1
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ class GenericFileParser(FileParser):
|
|||||||
name, extension = os.path.splitext(name)
|
name, extension = os.path.splitext(name)
|
||||||
|
|
||||||
info["size"] = file_stat.st_size
|
info["size"] = file_stat.st_size
|
||||||
info["path"] = path[self.root_dir_len:]
|
info["path"] = os.path.relpath(path, self.root_dir)
|
||||||
info["name"] = name
|
info["name"] = name
|
||||||
info["extension"] = extension[1:]
|
info["extension"] = extension[1:]
|
||||||
info["mtime"] = file_stat.st_mtime
|
info["mtime"] = file_stat.st_mtime
|
||||||
|
2
run.py
2
run.py
@ -241,7 +241,7 @@ def search_liste_page():
|
|||||||
|
|
||||||
def get_allowed_dirs(username):
|
def get_allowed_dirs(username):
|
||||||
if config.allow_guests:
|
if config.allow_guests:
|
||||||
return [x for x in storage.dirs() if x.enabled]
|
return [x for x in storage.dirs() if storage.dirs()[x].enabled]
|
||||||
if username:
|
if username:
|
||||||
user = storage.users()[username]
|
user = storage.users()[username]
|
||||||
return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories]
|
return [x for x in storage.dirs() if storage.dirs()[x].enabled and x in user.readable_directories]
|
||||||
|
@ -149,7 +149,7 @@ class Search:
|
|||||||
"aggs": {
|
"aggs": {
|
||||||
"total_size": {"sum": {"field": "size"}}
|
"total_size": {"sum": {"field": "size"}}
|
||||||
},
|
},
|
||||||
"size": 40}, index=self.index_name, scroll="3m")
|
"size": 40}, index=self.index_name, scroll="30m")
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
@ -267,7 +267,7 @@ function createDocCard(hit) {
|
|||||||
}
|
}
|
||||||
thumbnailOverlay.appendChild(resolutionBadge);
|
thumbnailOverlay.appendChild(resolutionBadge);
|
||||||
|
|
||||||
var format = hit["_source"]["format"];
|
var format = hit["_source"]["format_name"];
|
||||||
|
|
||||||
//Hover
|
//Hover
|
||||||
if(format === "GIF") {
|
if(format === "GIF") {
|
||||||
@ -429,6 +429,8 @@ window.addEventListener("scroll", function () {
|
|||||||
if (hits.length !== 0) {
|
if (hits.length !== 0) {
|
||||||
coolingDown = false;
|
coolingDown = false;
|
||||||
}
|
}
|
||||||
|
} else if (this.status === 500) {
|
||||||
|
window.location.reload()
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true);
|
xhttp.open("GET", "/scroll?scroll_id=" + scroll_id, true);
|
||||||
|
@ -143,7 +143,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-header">Options <a href="#" style="float:right">Learn more <i
|
<div class="card-header">Options <a href="https://github.com/simon987/Simple-Incremental-Search-Tool/blob/master/config.py#L1-L13" style="float:right">Learn more <i
|
||||||
class="fas fa-external-link-alt"></i></a></div>
|
class="fas fa-external-link-alt"></i></a></div>
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<table class="info-table table-striped table-hover">
|
<table class="info-table table-striped table-hover">
|
||||||
|
53
thumbnail.py
53
thumbnail.py
@ -1,6 +1,10 @@
|
|||||||
|
from queue import Full, Empty
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import os
|
import os
|
||||||
from multiprocessing import Value, Process
|
from multiprocessing import Value, Process
|
||||||
|
from queue import Queue
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
import config
|
import config
|
||||||
|
|
||||||
@ -22,10 +26,11 @@ class ThumbnailGenerator:
|
|||||||
|
|
||||||
if mime == "image/svg+xml" and config.cairosvg:
|
if mime == "image/svg+xml" and config.cairosvg:
|
||||||
|
|
||||||
|
tmpfile = dest_path + "_tmp"
|
||||||
try:
|
try:
|
||||||
p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": "tmp"})
|
p = Process(target=cairosvg.svg2png, kwargs={"url": path, "write_to": tmpfile})
|
||||||
p.start()
|
p.start()
|
||||||
p.join(1)
|
p.join(5)
|
||||||
|
|
||||||
if p.is_alive():
|
if p.is_alive():
|
||||||
p.terminate()
|
p.terminate()
|
||||||
@ -35,8 +40,8 @@ class ThumbnailGenerator:
|
|||||||
except Exception:
|
except Exception:
|
||||||
print("Couldn't make thumbnail for " + path)
|
print("Couldn't make thumbnail for " + path)
|
||||||
|
|
||||||
if os.path.exists("tmp"):
|
if os.path.exists(tmpfile):
|
||||||
os.remove("tmp")
|
os.remove(tmpfile)
|
||||||
|
|
||||||
elif mime.startswith("image"):
|
elif mime.startswith("image"):
|
||||||
|
|
||||||
@ -59,11 +64,16 @@ class ThumbnailGenerator:
|
|||||||
if os.path.exists("tmp"):
|
if os.path.exists("tmp"):
|
||||||
os.remove("tmp")
|
os.remove("tmp")
|
||||||
|
|
||||||
def generate_all(self, docs, dest_path, counter: Value=None, directory=None):
|
def worker(self, in_q: Queue, counter: Value, dest_path, directory):
|
||||||
|
|
||||||
os.makedirs(dest_path, exist_ok=True)
|
while True:
|
||||||
|
try:
|
||||||
|
doc = in_q.get(timeout=1)
|
||||||
|
if doc is None:
|
||||||
|
break
|
||||||
|
except Empty:
|
||||||
|
break
|
||||||
|
|
||||||
for doc in docs:
|
|
||||||
extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"]
|
extension = "" if doc["_source"]["extension"] == "" else "." + doc["_source"]["extension"]
|
||||||
full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension)
|
full_path = os.path.join(directory.path, doc["_source"]["path"], doc["_source"]["name"] + extension)
|
||||||
|
|
||||||
@ -73,6 +83,35 @@ class ThumbnailGenerator:
|
|||||||
if counter is not None:
|
if counter is not None:
|
||||||
counter.value += 1
|
counter.value += 1
|
||||||
|
|
||||||
|
in_q.task_done()
|
||||||
|
|
||||||
|
def generate_all(self, docs, dest_path, counter: Value = None, directory=None, total_count=None):
|
||||||
|
|
||||||
|
os.makedirs(dest_path, exist_ok=True)
|
||||||
|
|
||||||
|
in_q = Queue(50000) # TODO: load from config?
|
||||||
|
threads = []
|
||||||
|
for _ in range(config.tn_threads):
|
||||||
|
t = Thread(target=self.worker, args=[in_q, counter, dest_path, directory])
|
||||||
|
threads.append(t)
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
for doc in docs:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
in_q.put(doc, timeout=10)
|
||||||
|
if total_count:
|
||||||
|
total_count.value += 1
|
||||||
|
break
|
||||||
|
except Full:
|
||||||
|
continue
|
||||||
|
|
||||||
|
in_q.join()
|
||||||
|
for _ in threads:
|
||||||
|
in_q.put(None)
|
||||||
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
|
||||||
def generate_image(self, path, dest_path):
|
def generate_image(self, path, dest_path):
|
||||||
|
|
||||||
with open(path, "rb") as image_file:
|
with open(path, "rb") as image_file:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user