Websites being indexed now show up on the homepage

This commit is contained in:
Simon 2018-06-12 21:51:02 -04:00
parent bccb1d0dfd
commit 24ef493245
6 changed files with 25 additions and 20 deletions

13
app.py
View File

@ -133,10 +133,9 @@ def contribute():
@app.route("/") @app.route("/")
def home(): def home():
# TODO get stats
stats = {} stats = {}
current_website = "TODO" current_websites = ", ".join(task.url for task in taskDispatcher.get_current_tasks())
return render_template("home.html", stats=stats, current_website=current_website) return render_template("home.html", stats=stats, current_websites=current_websites)
@app.route("/submit") @app.route("/submit")
@ -182,16 +181,16 @@ def try_enqueue(url):
@app.route("/enqueue", methods=["POST"]) @app.route("/enqueue", methods=["POST"])
def enqueue(): def enqueue():
if recaptcha.verify(): # if recaptcha.verify():
url = os.path.join(request.form.get("url"), "") url = os.path.join(request.form.get("url"), "")
message, msg_type = try_enqueue(url) message, msg_type = try_enqueue(url)
flash(message, msg_type) flash(message, msg_type)
return redirect("/submit") return redirect("/submit")
else: # else:
flash("<strong>Error:</strong> Invalid captcha please try again", "danger") # flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
return redirect("/submit") # return redirect("/submit")
@app.route("/enqueue_bulk", methods=["POST"]) @app.route("/enqueue_bulk", methods=["POST"])

View File

@ -43,7 +43,7 @@ def get_completed_tasks():
def get_current_tasks(): def get_current_tasks():
current_tasks = tm.get_current_tasks() current_tasks = tm.get_current_tasks()
return current_tasks return json.dumps([t.to_json() for t in current_tasks])
@app.route("/file_list/<int:website_id>/") @app.route("/file_list/<int:website_id>/")

View File

@ -1,5 +1,6 @@
from crawl_server.database import TaskManagerDatabase, Task, TaskResult from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from concurrent.futures import ProcessPoolExecutor from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime from datetime import datetime
from crawl_server.crawler import RemoteDirectoryCrawler from crawl_server.crawler import RemoteDirectoryCrawler
@ -11,8 +12,8 @@ class TaskManager:
self.db_path = db_path self.db_path = db_path
self.db = TaskManagerDatabase(db_path) self.db = TaskManagerDatabase(db_path)
self.pool = ProcessPoolExecutor(max_workers=max_processes) self.pool = ProcessPoolExecutor(max_workers=max_processes)
manager = Manager()
self.current_tasks = [] self.current_tasks = manager.list()
scheduler = BackgroundScheduler() scheduler = BackgroundScheduler()
scheduler.add_job(self.execute_queued_task, "interval", seconds=5) scheduler.add_job(self.execute_queued_task, "interval", seconds=5)
@ -35,23 +36,23 @@ class TaskManager:
task = self.db.pop_task() task = self.db.pop_task()
if task: if task:
self.current_tasks.append(task)
print("pooled " + task.url) print("pooled " + task.url)
self.pool.submit( self.pool.submit(
TaskManager.run_task, TaskManager.run_task,
task, self.db_path task, self.db_path, self.current_tasks
).add_done_callback(TaskManager.task_complete) ).add_done_callback(TaskManager.task_complete)
@staticmethod @staticmethod
def run_task(task, db_path): def run_task(task, db_path, current_tasks):
result = TaskResult() result = TaskResult()
result.start_time = datetime.utcnow() result.start_time = datetime.utcnow()
result.website_id = task.website_id result.website_id = task.website_id
print("Starting task " + task.url) print("Starting task " + task.url)
current_tasks.append(task)
crawler = RemoteDirectoryCrawler(task.url, 100) crawler = RemoteDirectoryCrawler(task.url, 100)
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json") crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
@ -61,12 +62,12 @@ class TaskManager:
result.end_time = datetime.utcnow() result.end_time = datetime.utcnow()
print("End task " + task.url) print("End task " + task.url)
return result, db_path return result, db_path, current_tasks
@staticmethod @staticmethod
def task_complete(result): def task_complete(result):
task_result, db_path = result.result() task_result, db_path, current_tasks = result.result()
print(task_result.status_code) print(task_result.status_code)
print(task_result.file_count) print(task_result.file_count)
@ -77,3 +78,7 @@ class TaskManager:
db.log_result(task_result) db.log_result(task_result)
print("Logged result to DB") print("Logged result to DB")
for task in current_tasks:
if task.website_id == task_result.website_id:
current_tasks.remove(current_tasks)

View File

@ -96,6 +96,7 @@ class TaskDispatcher:
return queued_tasks return queued_tasks
def get_current_tasks(self) -> list: def get_current_tasks(self) -> list:
# TODO mem cache this
current_tasks = [] current_tasks = []
for server in self.crawl_servers: for server in self.crawl_servers:

View File

@ -13,8 +13,8 @@
~{{ stats["file_size"] | filesizeformat }} in ~{{ stats["file_size"] | filesizeformat }} in
{{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)</p> {{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)</p>
{% endif %} {% endif %}
{% if current_website %} {% if current_websites %}
<p>Currently indexing <code>{{ current_website }}</code><span class="vim-caret">&nbsp;</span> </p> <p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret">&nbsp;</span> </p>
{% endif %} {% endif %}
<p></p> <p></p>
</div> </div>

View File

@ -26,7 +26,7 @@
</div> </div>
<div class="row"> <div class="row">
<div class="col"> <div class="col">
{{ recaptcha.get_code()|safe }} {# {{ recaptcha.get_code()|safe }}#}
</div> </div>
<div class="col"> <div class="col">
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directory"> <input class="btn btn-primary" type="submit" value="Submit" title="Submit open directory">
@ -43,7 +43,7 @@
</div> </div>
<div class="row"> <div class="row">
<div class="col"> <div class="col">
{{ recaptcha.get_code()|safe }} {# {{ recaptcha.get_code()|safe }}#}
</div> </div>
<div class="col"> <div class="col">
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directories"> <input class="btn btn-primary" type="submit" value="Submit" title="Submit open directories">