diff --git a/crawler.py b/crawler.py index 4c0fa89..e3be54a 100644 --- a/crawler.py +++ b/crawler.py @@ -4,7 +4,7 @@ import json from multiprocessing import Process, Value from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser -import time +from indexer import Indexer class RunningTask: @@ -13,6 +13,7 @@ class RunningTask: self.total_files = 0 self.parsed_files = Value("i", 0) self.task = task + self.done = Value("i", 0) def to_json(self): return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files, "id": self.task.id}) @@ -67,6 +68,7 @@ class TaskManager: self.current_task = None self.storage = storage self.current_process = None + self.indexer = Indexer("changeme") scheduler = BackgroundScheduler() scheduler.add_job(self.check_new_task, "interval", seconds=0.5) @@ -75,20 +77,25 @@ class TaskManager: def start_task(self, task: Task): self.current_task = RunningTask(task) - c = Crawler([GenericFileParser([Md5CheckSumCalculator()], ExtensionMimeGuesser())]) + c = Crawler([]) path = self.storage.dirs()[task.dir_id].path self.current_task.total_files = c.countFiles(path) print("Started task - " + str(self.current_task.total_files) + " files") print(path) - self.current_process = Process(target=self.execute_crawl, args=(c, path, self.current_task.parsed_files)) - self.current_process.daemon = True + self.current_process = Process(target=self.execute_crawl, args=(path, self.current_task.parsed_files, self.current_task.done)) + # self.current_process.daemon = True self.current_process.start() - def execute_crawl(self, c: Crawler, path: str, counter: Value): + def execute_crawl(self, path: str, counter: Value, done: Value): + c = Crawler([GenericFileParser([Md5CheckSumCalculator()], ExtensionMimeGuesser())]) c.crawl(path, counter) + + Indexer("changeme").index(c.documents) + print("Done") + done.value = 1 def cancel_task(self): self.current_task = None @@ -101,9 +108,7 @@ class TaskManager: if not self.storage.tasks()[i].completed: self.start_task(self.storage.tasks()[i]) else: - print(self.current_task.parsed_files.value) - - if self.current_task.parsed_files.value == self.current_task.total_files: + if self.current_task.done.value == 1: self.current_process.terminate() self.storage.del_task(self.current_task.task.id) diff --git a/indexer.py b/indexer.py index a9c0883..fdebed4 100644 --- a/indexer.py +++ b/indexer.py @@ -22,34 +22,60 @@ class Indexer: t.daemon = True t.start() - time.sleep(5) + time.sleep(10) + self.init() @staticmethod def run_elasticsearch(): subprocess.Popen(["elasticsearch/bin/elasticsearch"]) @staticmethod - def create_bulk_index_string(docs: list, index_name: str): + def create_bulk_index_string(docs: list): """ Creates a insert string for sending to elasticsearch """ + print("Creating bulk index string...") + result = "" - action_string = '{"index":{"_index":"' + index_name + '","_type":"file"}}\n' + action_string = '{"index":{}}\n' for doc in docs: result += action_string result += json.dumps(doc) + "\n" + print(result) + return result def index(self, docs: list): - - index_string = self.create_bulk_index_string(docs, self.index_name) - self.es.bulk(index_string) + print("Indexing " + str(len(docs)) + " docs") + index_string = Indexer.create_bulk_index_string(docs) + print("bulk-start") + self.es.bulk(body=index_string, index=self.index_name, doc_type="file") + print("bulk-done") def clear(self): self.es.indices.delete(self.index_name) self.es.indices.create(self.index_name) + + def init(self): + self.es.indices.delete(index=self.index_name) + self.es.indices.create(index=self.index_name) + self.es.indices.close(index=self.index_name) + + self.es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {' + '"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {' + '"type": "path_hierarchy"}}}}', index=self.index_name) + + self.es.indices.put_mapping(body='{"properties": {' + '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},' + '"suggest-path": {"type": "completion", "analyzer": "keyword"},' + '"mime": {"type": "keyword"}' + '}}', doc_type="file", index=self.index_name) + + self.es.indices.open(index=self.index_name) + + print("Initialised elesticsearch") diff --git a/templates/layout.html b/templates/layout.html index 31518e3..6792640 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -42,7 +42,7 @@ {# todo: box-shadow 0 1px 10px 1px #1AC8DE#} - +
Navbar1 diff --git a/templates/task.html b/templates/task.html index abbf5ac..603a700 100644 --- a/templates/task.html +++ b/templates/task.html @@ -75,8 +75,15 @@ var percent = currentTask.parsed / currentTask.total * 100; try { - document.getElementById("task-bar-" + currentTask.id).setAttribute("style", "width: " + percent + "%;"); + + var bar = document.getElementById("task-bar-" + currentTask.id); + bar.setAttribute("style", "width: " + percent + "%;"); document.getElementById("task-label-" + currentTask.id).innerHTML = currentTask.parsed + " / " + currentTask.total + " (" + percent.toFixed(2) + "%)"; + + if (percent === 100) { + bar.classList.add("bg-success") + } + } catch (e) { window.reload(); }