Added button to queue empty websites

This commit is contained in:
Simon 2018-06-24 19:33:15 -04:00
parent f6ee338c0f
commit 059d9fd366
4 changed files with 40 additions and 16 deletions

35
app.py
View File

@ -2,7 +2,7 @@ from flask import Flask, render_template, redirect, request, flash, abort, Respo
import json import json
import os import os
import time import time
import ssl import itertools
from database import Database, Website, InvalidQueryException from database import Database, Website, InvalidQueryException
from flask_recaptcha import ReCaptcha from flask_recaptcha import ReCaptcha
import od_util import od_util
@ -126,18 +126,24 @@ def admin_redispatch_queued():
abort(404) abort(404)
def get_empty_websites():
    """Return the empty websites among those with no queued or running task.

    Collects the ``website_id`` of every task reported by
    ``taskDispatcher.get_queued_tasks()`` and
    ``taskDispatcher.get_current_tasks()``, removes those from the set of
    values returned by ``db.get_all_websites()``, and hands the remainder
    (as a list) to ``searchEngine.are_empty``, whose result is returned
    unchanged.

    NOTE(review): ``db.get_all_websites()`` presumably yields website ids
    (the values are compared against ``task.website_id``) -- confirm.
    """
    queued = taskDispatcher.get_queued_tasks()
    running = taskDispatcher.get_current_tasks()

    # Both task sources are walked as a single stream; chain() accepts any
    # iterable, so neither source needs to be a list.
    busy_ids = []
    for pending_task in itertools.chain(queued, running):
        busy_ids.append(pending_task.website_id)

    known_websites = set(db.get_all_websites())
    idle_websites = known_websites.difference(busy_ids)
    return searchEngine.are_empty(list(idle_websites))
@app.route("/website/delete_empty") @app.route("/website/delete_empty")
def admin_delete_empty_website(): def admin_delete_empty_website():
"""Delete websites with no associated files that are not queued""" """Delete websites with no associated files that are not queued"""
if "username" in session: if "username" in session:
current_tasks = taskDispatcher.get_queued_tasks() + taskDispatcher.get_current_tasks() empty_websites = get_empty_websites()
queued_websites = [task.website_id for task in current_tasks]
all_websites = db.get_all_websites()
non_queued_websites = list(set(all_websites).difference(queued_websites))
empty_websites = searchEngine.are_empty(non_queued_websites)
for website in empty_websites: for website in empty_websites:
#db.delete_website(website) #db.delete_website(website)
@ -150,6 +156,21 @@ def admin_delete_empty_website():
abort(403) abort(403)
@app.route("/website/queue_empty")
def admin_queue_empty_websites():
    """Dispatch a task for every website returned by get_empty_websites().

    Only available when "username" is present in the session; otherwise
    the request is aborted with HTTP 403. On success a flash message is
    recorded and the client is redirected to /dashboard.
    """
    # Guard clause: abort() raises, so nothing below runs when logged out.
    if "username" not in session:
        abort(403)

    for empty_id in get_empty_websites():
        site = db.get_website_by_id(empty_id)
        # NOTE(review): the third Task argument (1) is presumably a
        # priority -- confirm against the Task constructor.
        taskDispatcher.dispatch_task(Task(site.id, site.url, 1))

    flash("Dispatched empty websites", "success")
    return redirect("/dashboard")
@app.route("/website/<int:website_id>/clear") @app.route("/website/<int:website_id>/clear")
def admin_clear_website(website_id): def admin_clear_website(website_id):

View File

@ -7,6 +7,9 @@ import re
from ftplib import FTP from ftplib import FTP
import config import config
import urllib3
# Silence urllib3 warnings for the whole process at import time.
# NOTE(review): presumably needed because is_od() calls
# requests.get(..., verify=False), which would otherwise warn about the
# unverified HTTPS request -- confirm.
urllib3.disable_warnings()
def truncate_path(path, max_len): def truncate_path(path, max_len):
pattern = re.compile(r"/?.*?/") pattern = re.compile(r"/?.*?/")
@ -170,7 +173,7 @@ def is_od(url):
ftp.close() ftp.close()
return True return True
elif config.SUBMIT_HTTP: elif config.SUBMIT_HTTP:
r = requests.get(url, timeout=30, allow_redirects=False) r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
if r.status_code != 200: if r.status_code != 200:
print("No redirects allowed!") print("No redirects allowed!")
return False return False

15
task.py
View File

@ -174,18 +174,17 @@ class TaskDispatcher:
for server in self.db.get_crawl_servers(): for server in self.db.get_crawl_servers():
for task in server.fetch_completed_tasks(): for task in server.fetch_completed_tasks():
print("Completed task") print("Completed task")
# All files are overwritten if task.file_count:
self.search.delete_docs(task.website_id) # All files are overwritten
file_list = server.fetch_website_files(task.website_id) self.search.delete_docs(task.website_id)
if file_list: file_list = server.fetch_website_files(task.website_id)
self.search.import_json(file_list, task.website_id) self.search.import_json(file_list, task.website_id)
# File list is safe to delete once indexed
server.free_website_files(task.website_id)
# Update last_modified date for website # Update last_modified date for website
self.db.update_website_date_if_exists(task.website_id) self.db.update_website_date_if_exists(task.website_id)
# File list is safe to delete once indexed
server.free_website_files(task.website_id)
def dispatch_task(self, task: Task): def dispatch_task(self, task: Task):
self._get_available_crawl_server().queue_task(task) self._get_available_crawl_server().queue_task(task)
@ -207,7 +206,7 @@ class TaskDispatcher:
return server_with_most_free_slots return server_with_most_free_slots
def get_queued_tasks(self) -> list: def get_queued_tasks(self):
queued_tasks_by_server = self._get_queued_tasks_by_server() queued_tasks_by_server = self._get_queued_tasks_by_server()
for queued_tasks in queued_tasks_by_server.values(): for queued_tasks in queued_tasks_by_server.values():

View File

@ -121,6 +121,7 @@
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a> <a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a>
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a> <a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
<a class="btn btn-danger" href="/website/queue_empty">Re-queue websites with no associated files</a>
<hr> <hr>
<a class="btn btn-info" href="/logout">Logout</a> <a class="btn btn-info" href="/logout">Logout</a>