diff --git a/crawler.py b/crawler.py index 7acb289..4c0fa89 100644 --- a/crawler.py +++ b/crawler.py @@ -1,4 +1,22 @@ import os +from storage import Task, LocalStorage +import json +from multiprocessing import Process, Value +from apscheduler.schedulers.background import BackgroundScheduler +from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser +import time + + +class RunningTask: + + def __init__(self, task: Task): + self.total_files = 0 + self.parsed_files = Value("i", 0) + self.task = task + + def to_json(self): + return json.dumps({"parsed": self.parsed_files.value, "total": self.total_files, "id": self.task.id}) + class Crawler: @@ -16,7 +34,8 @@ class Crawler: for ext in parser.extensions: self.ext_map[ext] = parser - def crawl(self, root_dir: str): + def crawl(self, root_dir: str, counter: Value=None): + for root, dirs, files in os.walk(root_dir): for filename in files: @@ -24,9 +43,15 @@ class Crawler: parser = self.ext_map.get(os.path.splitext(filename)[1], self.default_parser) - doc = parser.parse(full_path) + try: + if counter: + counter.value += 1 - self.documents.append(doc) + doc = parser.parse(full_path) + + self.documents.append(doc) + except FileNotFoundError: + continue # File was deleted def countFiles(self, root_dir: str): count = 0 @@ -36,3 +61,52 @@ class Crawler: return count + +class TaskManager: + def __init__(self, storage: LocalStorage): + self.current_task = None + self.storage = storage + self.current_process = None + + scheduler = BackgroundScheduler() + scheduler.add_job(self.check_new_task, "interval", seconds=0.5) + scheduler.start() + + def start_task(self, task: Task): + self.current_task = RunningTask(task) + + c = Crawler([GenericFileParser([Md5CheckSumCalculator()], ExtensionMimeGuesser())]) + path = self.storage.dirs()[task.dir_id].path + self.current_task.total_files = c.countFiles(path) + + print("Started task - " + str(self.current_task.total_files) + " files") + print(path) + + self.current_process = Process(target=self.execute_crawl, args=(c, path, self.current_task.parsed_files)) + self.current_process.daemon = True + self.current_process.start() + + def execute_crawl(self, c: Crawler, path: str, counter: Value): + c.crawl(path, counter) + print("Done") + + def cancel_task(self): + self.current_task = None + self.current_process.terminate() + + def check_new_task(self): + + if self.current_task is None: + for i in sorted(self.storage.tasks(), reverse=True): + if not self.storage.tasks()[i].completed: + self.start_task(self.storage.tasks()[i]) + else: + print(self.current_task.parsed_files.value) + + if self.current_task.parsed_files.value == self.current_task.total_files: + + self.current_process.terminate() + self.storage.del_task(self.current_task.task.id) + self.current_task = None + + diff --git a/requirements.txt b/requirements.txt index 57e9b2c..3d0e91d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ flask flask_bcrypt elasticsearch python-magic -requests \ No newline at end of file +requests +apscheduler \ No newline at end of file diff --git a/run.py b/run.py index 9e31ec5..1868696 100644 --- a/run.py +++ b/run.py @@ -1,128 +1,19 @@ from flask import Flask, render_template, send_file, request, redirect, flash, session from indexer import Indexer from storage import Directory, Option, Task +from storage import LocalStorage, DuplicateDirectoryException +from crawler import RunningTask, TaskManager import json # indexer = Indexer("fse") app = Flask(__name__) app.secret_key = "A very secret key" -# -# class Document: -# def __init__(self, doc_id, name, path, size, md5): -# self.doc_id = doc_id -# self.name = name -# self.path = path -# self.size = size -# self.md5 = md5 -# -# -# class ImageDocument(Document): -# def __init__(self, doc_id, name, path, size, md5): -# super().__init__(doc_id, name, path, size, md5) -# self.type = "image" -# -# -# class AudioClipDocument(Document): -# def __init__(self, doc_id, name, path, size, md5): -# super().__init__(doc_id, name, path, size, md5) -# self.type = "audio" -# -# -# def get_document(id): -# -# response = requests.get(SOLR_URL + "get?id=" + id) -# -# return json.loads(response.text)["doc"] -# -# -# def make_thumb(doc): -# size = (1024, 1024) -# -# thumb_path = "thumbnails/" + doc["id"] -# -# if not os.path.exists(thumb_path): -# -# file_path = doc["path"][0] + "/" + doc["name"][0] -# -# if doc["width"][0] > size[0]: -# -# image = Image.open(file_path) -# image.thumbnail(size, Image.ANTIALIAS) -# -# if image.mode == "RGB": -# image.save(thumb_path, "JPEG") -# elif image.mode == "RGBA": -# image.save(thumb_path, "PNG") -# else: -# image = image.convert("RGB") -# image.save(thumb_path, "JPEG") -# else: -# print("Skipping thumbnail") -# os.symlink(file_path, thumb_path) -# -# return "thumbnails/" + doc["id"] -# -# -# @app.route("/search/") -# def search(): -# -# query = request.args.get("query") -# page = int(request.args.get("page")) -# per_page = int(request.args.get("per_page")) -# -# results = solr.search(query, None, rows=per_page, start=per_page * page) -# -# docs = [] -# for r in results: -# -# if "mime" in r: -# mime_type = r["mime"][0] -# else: -# mime_type = "" -# -# if mime_type.startswith("image"): -# docs.append(ImageDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) -# -# elif mime_type.startswith("audio"): -# docs.append(AudioClipDocument(r["id"], r["name"][0], r["path"][0], r["size"], r["md5"])) -# -# return render_template("search.html", docs=docs) -# -# -# @app.route("/") -# def index(): -# return render_template("index.html") -# -# -# @app.route("/files//") -# def files(id): -# -# doc = get_document(id) -# -# if doc is not None: -# file_path = doc["path"][0] + "/" + doc["name"][0] -# return send_file(file_path, mimetype=mimetypes.guess_type(file_path)[0]) -# else: -# return "File not found" -# -# -# @app.route("/thumbs//") -# def thumbs(doc_id): -# -# doc = get_document(doc_id) -# -# if doc is not None: -# -# thumb_path = make_thumb(doc) -# -# return send_file("thumbnails/" + doc_id, mimetype=mimetypes.guess_type(thumb_path)[0]) -# else: -# return "File not found" -from storage import LocalStorage, DuplicateDirectoryException storage = LocalStorage("local_storage.db") +tm = TaskManager(storage) + @app.route("/") def tmp_route(): return "huh" @@ -225,10 +116,34 @@ def directory_del(dir_id): return redirect("/directory") +for t in storage.tasks(): + a_task = t + break + +# tm = None + @app.route("/task") def task(): - return render_template("task.html", tasks=storage.tasks(), directories=storage.dirs()) + return render_template("task.html", tasks=storage.tasks(), directories=storage.dirs(), + task_list=json.dumps(list(storage.tasks().keys()))) + # return render_template("task.html", tasks=storage.tasks(), directories=storage.dirs()) + + +@app.route("/task/current") +def get_current_task(): + + if tm and tm.current_task: + return tm.current_task.to_json() + else: + return "" + + +@app.route("/task/current/cancel") +def cancel_current_task(): + + tm.cancel_task() + return redirect("/task") @app.route("/task/add") diff --git a/templates/directory.html b/templates/directory.html index e1f59c7..7235ab1 100644 --- a/templates/directory.html +++ b/templates/directory.html @@ -6,9 +6,9 @@
{# Add directory form #} -
-
An excellent form
-
+
+
An excellent form
+
@@ -24,9 +24,9 @@
{# List of directories #} -
-
An excellent list
-
+
+
An excellent list
+
diff --git a/templates/directory_manage.html b/templates/directory_manage.html index 673fb2b..877ef7f 100644 --- a/templates/directory_manage.html +++ b/templates/directory_manage.html @@ -70,10 +70,10 @@
-
+
-
Summary
-
+
Summary
+
@@ -106,9 +106,9 @@ -
-
An excellent option list
-
+
+
An excellent option list
+
@@ -136,48 +136,41 @@ -
-
+
+
- -
+
- +
-
-
-
An excellent control panel
-
+
+
An excellent control panel
+
-
- - Create a task - - - + - diff --git a/templates/layout.html b/templates/layout.html index fa3f3b1..31518e3 100644 --- a/templates/layout.html +++ b/templates/layout.html @@ -7,17 +7,13 @@ - - - + + + - - - + {# #} @@ -35,6 +31,10 @@ padding: 4px; } + .card { + margin-top: 1em; + } + {# .info-table tr:nth-child(even) {#} {# background-color: #fafafa;#} {# }#} @@ -44,29 +44,8 @@ -
- - - - - - - - - - - {% for task_id in tasks %} - - - - - - - - - {% endfor %} - - -
Task typeDirectoryCompletedAction
{{ tasks[task_id].type }}{{ directories[tasks[task_id].dir_id].name }}{{ tasks[task_id].completed }}Cancel
+ + + +
+
An excellent panel
+
+ {% for task_id in tasks | sort()%} +
+ {{ directories[tasks[task_id].dir_id].name }} - + {{ tasks[task_id].type }} + +
+
+
+
+ Queued +
+
+
+ + +
+ +
+ + {% endfor %}