Added button to queue empty websites

This commit is contained in:
Simon 2018-06-24 19:33:15 -04:00
parent f6ee338c0f
commit 059d9fd366
4 changed files with 40 additions and 16 deletions

35
app.py
View File

@ -2,7 +2,7 @@ from flask import Flask, render_template, redirect, request, flash, abort, Respo
import json
import os
import time
import ssl
import itertools
from database import Database, Website, InvalidQueryException
from flask_recaptcha import ReCaptcha
import od_util
@ -126,18 +126,24 @@ def admin_redispatch_queued():
abort(404)
def get_empty_websites():
    """Collect the websites with no queued or running task and check them for emptiness.

    Website ids referenced by any queued or currently running task are
    removed from the full website list; the remainder is handed to
    ``searchEngine.are_empty`` and its result is returned unchanged.
    """
    busy_website_ids = [
        pending.website_id
        for pending in itertools.chain(
            taskDispatcher.get_queued_tasks(),
            taskDispatcher.get_current_tasks(),
        )
    ]
    # Set difference drops every website that already has a task attached.
    idle_websites = list(set(db.get_all_websites()).difference(busy_website_ids))
    return searchEngine.are_empty(idle_websites)
@app.route("/website/delete_empty")
def admin_delete_empty_website():
"""Delete websites with no associated files that are not queued"""
if "username" in session:
current_tasks = taskDispatcher.get_queued_tasks() + taskDispatcher.get_current_tasks()
queued_websites = [task.website_id for task in current_tasks]
all_websites = db.get_all_websites()
non_queued_websites = list(set(all_websites).difference(queued_websites))
empty_websites = searchEngine.are_empty(non_queued_websites)
empty_websites = get_empty_websites()
for website in empty_websites:
#db.delete_website(website)
@ -150,6 +156,21 @@ def admin_delete_empty_website():
abort(403)
@app.route("/website/queue_empty")
def admin_queue_empty_websites():
    """Dispatch a new task for every website returned by get_empty_websites().

    Requires a "username" key in the session; otherwise the request is
    aborted with 403. On success a flash message is set and the client is
    redirected to the dashboard.
    """
    if "username" not in session:
        abort(403)
    else:
        for empty_id in get_empty_websites():
            site = db.get_website_by_id(empty_id)
            # Third Task argument is passed as 1, exactly as before
            # (presumably a priority -- confirm against Task's signature).
            taskDispatcher.dispatch_task(Task(site.id, site.url, 1))

        flash("Dispatched empty websites", "success")
        return redirect("/dashboard")
@app.route("/website/<int:website_id>/clear")
def admin_clear_website(website_id):

View File

@ -7,6 +7,9 @@ import re
from ftplib import FTP
import config
import urllib3
urllib3.disable_warnings()
def truncate_path(path, max_len):
pattern = re.compile(r"/?.*?/")
@ -170,7 +173,7 @@ def is_od(url):
ftp.close()
return True
elif config.SUBMIT_HTTP:
r = requests.get(url, timeout=30, allow_redirects=False)
r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
if r.status_code != 200:
print("No redirects allowed!")
return False

15
task.py
View File

@ -174,18 +174,17 @@ class TaskDispatcher:
for server in self.db.get_crawl_servers():
for task in server.fetch_completed_tasks():
print("Completed task")
# All files are overwritten
self.search.delete_docs(task.website_id)
file_list = server.fetch_website_files(task.website_id)
if file_list:
if task.file_count:
# All files are overwritten
self.search.delete_docs(task.website_id)
file_list = server.fetch_website_files(task.website_id)
self.search.import_json(file_list, task.website_id)
# File list is safe to delete once indexed
server.free_website_files(task.website_id)
# Update last_modified date for website
self.db.update_website_date_if_exists(task.website_id)
# File list is safe to delete once indexed
server.free_website_files(task.website_id)
def dispatch_task(self, task: Task):
    """Queue ``task`` on the server picked by ``_get_available_crawl_server()``."""
    target_server = self._get_available_crawl_server()
    target_server.queue_task(task)
@ -207,7 +206,7 @@ class TaskDispatcher:
return server_with_most_free_slots
def get_queued_tasks(self) -> list:
def get_queued_tasks(self):
queued_tasks_by_server = self._get_queued_tasks_by_server()
for queued_tasks in queued_tasks_by_server.values():

View File

@ -121,6 +121,7 @@
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a>
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
<a class="btn btn-danger" href="/website/queue_empty">Re-queue websites with no associated files</a>
<hr>
<a class="btn btn-info" href="/logout">Logout</a>