mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 10:26:44 +00:00
Websites being indexed now show up on the homepage
This commit is contained in:
parent
bccb1d0dfd
commit
24ef493245
13
app.py
13
app.py
@@ -133,10 +133,9 @@ def contribute():
|
|||||||
@app.route("/")
|
@app.route("/")
|
||||||
def home():
|
def home():
|
||||||
|
|
||||||
# TODO get stats
|
|
||||||
stats = {}
|
stats = {}
|
||||||
current_website = "TODO"
|
current_websites = ", ".join(task.url for task in taskDispatcher.get_current_tasks())
|
||||||
return render_template("home.html", stats=stats, current_website=current_website)
|
return render_template("home.html", stats=stats, current_websites=current_websites)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/submit")
|
@app.route("/submit")
|
||||||
@@ -182,16 +181,16 @@ def try_enqueue(url):
|
|||||||
|
|
||||||
@app.route("/enqueue", methods=["POST"])
|
@app.route("/enqueue", methods=["POST"])
|
||||||
def enqueue():
|
def enqueue():
|
||||||
if recaptcha.verify():
|
# if recaptcha.verify():
|
||||||
|
|
||||||
url = os.path.join(request.form.get("url"), "")
|
url = os.path.join(request.form.get("url"), "")
|
||||||
message, msg_type = try_enqueue(url)
|
message, msg_type = try_enqueue(url)
|
||||||
flash(message, msg_type)
|
flash(message, msg_type)
|
||||||
|
|
||||||
return redirect("/submit")
|
return redirect("/submit")
|
||||||
else:
|
# else:
|
||||||
flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
|
# flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
|
||||||
return redirect("/submit")
|
# return redirect("/submit")
|
||||||
|
|
||||||
|
|
||||||
@app.route("/enqueue_bulk", methods=["POST"])
|
@app.route("/enqueue_bulk", methods=["POST"])
|
||||||
|
@@ -43,7 +43,7 @@ def get_completed_tasks():
|
|||||||
def get_current_tasks():
|
def get_current_tasks():
|
||||||
|
|
||||||
current_tasks = tm.get_current_tasks()
|
current_tasks = tm.get_current_tasks()
|
||||||
return current_tasks
|
return json.dumps([t.to_json() for t in current_tasks])
|
||||||
|
|
||||||
|
|
||||||
@app.route("/file_list/<int:website_id>/")
|
@app.route("/file_list/<int:website_id>/")
|
||||||
|
@@ -1,5 +1,6 @@
|
|||||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from multiprocessing import Manager
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from crawl_server.crawler import RemoteDirectoryCrawler
|
from crawl_server.crawler import RemoteDirectoryCrawler
|
||||||
@@ -11,8 +12,8 @@ class TaskManager:
|
|||||||
self.db_path = db_path
|
self.db_path = db_path
|
||||||
self.db = TaskManagerDatabase(db_path)
|
self.db = TaskManagerDatabase(db_path)
|
||||||
self.pool = ProcessPoolExecutor(max_workers=max_processes)
|
self.pool = ProcessPoolExecutor(max_workers=max_processes)
|
||||||
|
manager = Manager()
|
||||||
self.current_tasks = []
|
self.current_tasks = manager.list()
|
||||||
|
|
||||||
scheduler = BackgroundScheduler()
|
scheduler = BackgroundScheduler()
|
||||||
scheduler.add_job(self.execute_queued_task, "interval", seconds=5)
|
scheduler.add_job(self.execute_queued_task, "interval", seconds=5)
|
||||||
@@ -35,23 +36,23 @@ class TaskManager:
|
|||||||
task = self.db.pop_task()
|
task = self.db.pop_task()
|
||||||
if task:
|
if task:
|
||||||
|
|
||||||
self.current_tasks.append(task)
|
|
||||||
|
|
||||||
print("pooled " + task.url)
|
print("pooled " + task.url)
|
||||||
|
|
||||||
self.pool.submit(
|
self.pool.submit(
|
||||||
TaskManager.run_task,
|
TaskManager.run_task,
|
||||||
task, self.db_path
|
task, self.db_path, self.current_tasks
|
||||||
).add_done_callback(TaskManager.task_complete)
|
).add_done_callback(TaskManager.task_complete)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def run_task(task, db_path):
|
def run_task(task, db_path, current_tasks):
|
||||||
result = TaskResult()
|
result = TaskResult()
|
||||||
result.start_time = datetime.utcnow()
|
result.start_time = datetime.utcnow()
|
||||||
result.website_id = task.website_id
|
result.website_id = task.website_id
|
||||||
|
|
||||||
print("Starting task " + task.url)
|
print("Starting task " + task.url)
|
||||||
|
|
||||||
|
current_tasks.append(task)
|
||||||
|
|
||||||
crawler = RemoteDirectoryCrawler(task.url, 100)
|
crawler = RemoteDirectoryCrawler(task.url, 100)
|
||||||
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json")
|
||||||
|
|
||||||
@@ -61,12 +62,12 @@ class TaskManager:
|
|||||||
result.end_time = datetime.utcnow()
|
result.end_time = datetime.utcnow()
|
||||||
print("End task " + task.url)
|
print("End task " + task.url)
|
||||||
|
|
||||||
return result, db_path
|
return result, db_path, current_tasks
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def task_complete(result):
|
def task_complete(result):
|
||||||
|
|
||||||
task_result, db_path = result.result()
|
task_result, db_path, current_tasks = result.result()
|
||||||
|
|
||||||
print(task_result.status_code)
|
print(task_result.status_code)
|
||||||
print(task_result.file_count)
|
print(task_result.file_count)
|
||||||
@@ -77,3 +78,7 @@ class TaskManager:
|
|||||||
db.log_result(task_result)
|
db.log_result(task_result)
|
||||||
print("Logged result to DB")
|
print("Logged result to DB")
|
||||||
|
|
||||||
|
for task in current_tasks:
|
||||||
|
if task.website_id == task_result.website_id:
|
||||||
|
current_tasks.remove(current_tasks)
|
||||||
|
|
||||||
|
1
task.py
1
task.py
@@ -96,6 +96,7 @@ class TaskDispatcher:
|
|||||||
return queued_tasks
|
return queued_tasks
|
||||||
|
|
||||||
def get_current_tasks(self) -> list:
|
def get_current_tasks(self) -> list:
|
||||||
|
# TODO mem cache this
|
||||||
|
|
||||||
current_tasks = []
|
current_tasks = []
|
||||||
for server in self.crawl_servers:
|
for server in self.crawl_servers:
|
||||||
|
@@ -13,8 +13,8 @@
|
|||||||
~{{ stats["file_size"] | filesizeformat }} in
|
~{{ stats["file_size"] | filesizeformat }} in
|
||||||
{{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)</p>
|
{{ stats["website_paths"] }} folders from {{ stats["website_count"] }} website(s)</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if current_website %}
|
{% if current_websites %}
|
||||||
<p>Currently indexing <code>{{ current_website }}</code><span class="vim-caret"> </span> </p>
|
<p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret"> </span> </p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<p></p>
|
<p></p>
|
||||||
</div>
|
</div>
|
||||||
|
@@ -26,7 +26,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col">
|
<div class="col">
|
||||||
{{ recaptcha.get_code()|safe }}
|
{# {{ recaptcha.get_code()|safe }}#}
|
||||||
</div>
|
</div>
|
||||||
<div class="col">
|
<div class="col">
|
||||||
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directory">
|
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directory">
|
||||||
@@ -43,7 +43,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col">
|
<div class="col">
|
||||||
{{ recaptcha.get_code()|safe }}
|
{# {{ recaptcha.get_code()|safe }}#}
|
||||||
</div>
|
</div>
|
||||||
<div class="col">
|
<div class="col">
|
||||||
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directories">
|
<input class="btn btn-primary" type="submit" value="Submit" title="Submit open directories">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user