From 24ef493245d219ec6e859903d73bb01998900f5c Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 12 Jun 2018 21:51:02 -0400 Subject: [PATCH] Websites being indexed now show up on the homepage --- app.py | 13 ++++++------- crawl_server/server.py | 2 +- crawl_server/task_manager.py | 21 +++++++++++++-------- task.py | 1 + templates/home.html | 4 ++-- templates/submit.html | 4 ++-- 6 files changed, 25 insertions(+), 20 deletions(-) diff --git a/app.py b/app.py index 26067e6..b0bfc32 100644 --- a/app.py +++ b/app.py @@ -133,10 +133,9 @@ def contribute(): @app.route("/") def home(): - # TODO get stats stats = {} - current_website = "TODO" - return render_template("home.html", stats=stats, current_website=current_website) + current_websites = ", ".join(task.url for task in taskDispatcher.get_current_tasks()) + return render_template("home.html", stats=stats, current_websites=current_websites) @app.route("/submit") @@ -182,16 +181,16 @@ def try_enqueue(url): @app.route("/enqueue", methods=["POST"]) def enqueue(): - if recaptcha.verify(): + # if recaptcha.verify(): url = os.path.join(request.form.get("url"), "") message, msg_type = try_enqueue(url) flash(message, msg_type) return redirect("/submit") - else: - flash("Error: Invalid captcha please try again", "danger") - return redirect("/submit") + # else: + # flash("Error: Invalid captcha please try again", "danger") + # return redirect("/submit") @app.route("/enqueue_bulk", methods=["POST"]) diff --git a/crawl_server/server.py b/crawl_server/server.py index 924f556..8da3be3 100644 --- a/crawl_server/server.py +++ b/crawl_server/server.py @@ -43,7 +43,7 @@ def get_completed_tasks(): def get_current_tasks(): current_tasks = tm.get_current_tasks() - return current_tasks + return json.dumps([t.to_json() for t in current_tasks]) @app.route("/file_list//") diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 94fbe78..7e5a44a 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ 
-1,5 +1,6 @@ from crawl_server.database import TaskManagerDatabase, Task, TaskResult from concurrent.futures import ProcessPoolExecutor +from multiprocessing import Manager from apscheduler.schedulers.background import BackgroundScheduler from datetime import datetime from crawl_server.crawler import RemoteDirectoryCrawler @@ -11,8 +12,8 @@ class TaskManager: self.db_path = db_path self.db = TaskManagerDatabase(db_path) self.pool = ProcessPoolExecutor(max_workers=max_processes) - - self.current_tasks = [] + manager = Manager() + self.current_tasks = manager.list() scheduler = BackgroundScheduler() scheduler.add_job(self.execute_queued_task, "interval", seconds=5) @@ -35,23 +36,23 @@ class TaskManager: task = self.db.pop_task() if task: - self.current_tasks.append(task) - print("pooled " + task.url) self.pool.submit( TaskManager.run_task, - task, self.db_path + task, self.db_path, self.current_tasks ).add_done_callback(TaskManager.task_complete) @staticmethod - def run_task(task, db_path): + def run_task(task, db_path, current_tasks): result = TaskResult() result.start_time = datetime.utcnow() result.website_id = task.website_id print("Starting task " + task.url) + current_tasks.append(task) + crawler = RemoteDirectoryCrawler(task.url, 100) crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json") @@ -61,12 +62,12 @@ class TaskManager: result.end_time = datetime.utcnow() print("End task " + task.url) - return result, db_path + return result, db_path, current_tasks @staticmethod def task_complete(result): - task_result, db_path = result.result() + task_result, db_path, current_tasks = result.result() print(task_result.status_code) print(task_result.file_count) @@ -77,3 +78,7 @@ class TaskManager: db.log_result(task_result) print("Logged result to DB") + for task in current_tasks: + if task.website_id == task_result.website_id: + current_tasks.remove(task) + diff --git a/task.py b/task.py index ae69dec..8c6063b 100644 --- a/task.py
+++ b/task.py @@ -96,6 +96,7 @@ class TaskDispatcher: return queued_tasks def get_current_tasks(self) -> list: + # TODO mem cache this current_tasks = [] for server in self.crawl_servers: diff --git a/templates/home.html b/templates/home.html index e921b7b..952f258 100644 --- a/templates/home.html +++ b/templates/home.html @@ -13,8 +13,8 @@ ~{{ stats["file_size"] | filesizeformat }} in {{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)

{% endif %} - {% if current_website %} -

Currently indexing {{ current_website }} 

+ {% if current_websites %} +

Currently indexing {{ current_websites }} 

{% endif %}

diff --git a/templates/submit.html b/templates/submit.html index a9d8dd8..b7b5b28 100644 --- a/templates/submit.html +++ b/templates/submit.html @@ -26,7 +26,7 @@
- {{ recaptcha.get_code()|safe }} +{# {{ recaptcha.get_code()|safe }}#}
@@ -43,7 +43,7 @@
- {{ recaptcha.get_code()|safe }} +{# {{ recaptcha.get_code()|safe }}#}