From 24ef493245d219ec6e859903d73bb01998900f5c Mon Sep 17 00:00:00 2001
From: Simon
Date: Tue, 12 Jun 2018 21:51:02 -0400
Subject: [PATCH] Websites being indexed now show up on the homepage
---
app.py | 13 ++++++-------
crawl_server/server.py | 2 +-
crawl_server/task_manager.py | 21 +++++++++++++--------
task.py | 1 +
templates/home.html | 4 ++--
templates/submit.html | 4 ++--
6 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/app.py b/app.py
index 26067e6..b0bfc32 100644
--- a/app.py
+++ b/app.py
@@ -133,10 +133,9 @@ def contribute():
@app.route("/")
def home():
- # TODO get stats
stats = {}
- current_website = "TODO"
- return render_template("home.html", stats=stats, current_website=current_website)
+ current_websites = ", ".join(task.url for task in taskDispatcher.get_current_tasks())
+ return render_template("home.html", stats=stats, current_websites=current_websites)
@app.route("/submit")
@@ -182,16 +181,16 @@ def try_enqueue(url):
@app.route("/enqueue", methods=["POST"])
def enqueue():
- if recaptcha.verify():
+ # if recaptcha.verify():
url = os.path.join(request.form.get("url"), "")
message, msg_type = try_enqueue(url)
flash(message, msg_type)
return redirect("/submit")
- else:
- flash("Error: Invalid captcha please try again", "danger")
- return redirect("/submit")
+ # else:
+ # flash("Error: Invalid captcha please try again", "danger")
+ # return redirect("/submit")
@app.route("/enqueue_bulk", methods=["POST"])
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 924f556..8da3be3 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -43,7 +43,7 @@ def get_completed_tasks():
def get_current_tasks():
current_tasks = tm.get_current_tasks()
- return current_tasks
+ return json.dumps([t.to_json() for t in current_tasks])
@app.route("/file_list//")
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 94fbe78..7e5a44a 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -1,5 +1,6 @@
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from concurrent.futures import ProcessPoolExecutor
+from multiprocessing import Manager
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime
from crawl_server.crawler import RemoteDirectoryCrawler
@@ -11,8 +12,8 @@ class TaskManager:
self.db_path = db_path
self.db = TaskManagerDatabase(db_path)
self.pool = ProcessPoolExecutor(max_workers=max_processes)
-
- self.current_tasks = []
+ manager = Manager()
+ self.current_tasks = manager.list()
scheduler = BackgroundScheduler()
scheduler.add_job(self.execute_queued_task, "interval", seconds=5)
@@ -35,23 +36,23 @@ class TaskManager:
task = self.db.pop_task()
if task:
- self.current_tasks.append(task)
-
print("pooled " + task.url)
self.pool.submit(
TaskManager.run_task,
- task, self.db_path
+ task, self.db_path, self.current_tasks
).add_done_callback(TaskManager.task_complete)
@staticmethod
- def run_task(task, db_path):
+ def run_task(task, db_path, current_tasks):
result = TaskResult()
result.start_time = datetime.utcnow()
result.website_id = task.website_id
print("Starting task " + task.url)
+ current_tasks.append(task)
+
crawler = RemoteDirectoryCrawler(task.url, 100)
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
@@ -61,12 +62,12 @@ class TaskManager:
result.end_time = datetime.utcnow()
print("End task " + task.url)
- return result, db_path
+ return result, db_path, current_tasks
@staticmethod
def task_complete(result):
- task_result, db_path = result.result()
+ task_result, db_path, current_tasks = result.result()
print(task_result.status_code)
print(task_result.file_count)
@@ -77,3 +78,7 @@ class TaskManager:
db.log_result(task_result)
print("Logged result to DB")
+ for task in current_tasks:
+ if task.website_id == task_result.website_id:
+ current_tasks.remove(task)
+
diff --git a/task.py b/task.py
index ae69dec..8c6063b 100644
--- a/task.py
+++ b/task.py
@@ -96,6 +96,7 @@ class TaskDispatcher:
return queued_tasks
def get_current_tasks(self) -> list:
+ # TODO mem cache this
current_tasks = []
for server in self.crawl_servers:
diff --git a/templates/home.html b/templates/home.html
index e921b7b..952f258 100644
--- a/templates/home.html
+++ b/templates/home.html
@@ -13,8 +13,8 @@
~{{ stats["file_size"] | filesizeformat }} in
{{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)
{% endif %}
- {% if current_website %}
- Currently indexing {{ current_website }}
+ {% if current_websites %}
+ Currently indexing {{ current_websites }}
{% endif %}
diff --git a/templates/submit.html b/templates/submit.html
index a9d8dd8..b7b5b28 100644
--- a/templates/submit.html
+++ b/templates/submit.html
@@ -26,7 +26,7 @@