diff --git a/app.py b/app.py index 6591f94..dd845ec 100644 --- a/app.py +++ b/app.py @@ -2,7 +2,7 @@ from flask import Flask, render_template, redirect, request, flash, abort, Respo import json import os import time -import ssl +import itertools from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util @@ -126,18 +126,24 @@ def admin_redispatch_queued(): abort(404) +def get_empty_websites(): + + current_tasks = itertools.chain(taskDispatcher.get_queued_tasks(), taskDispatcher.get_current_tasks()) + + queued_websites = [task.website_id for task in current_tasks] + all_websites = db.get_all_websites() + non_queued_websites = list(set(all_websites).difference(queued_websites)) + + return searchEngine.are_empty(non_queued_websites) + + @app.route("/website/delete_empty") def admin_delete_empty_website(): """Delete websites with no associated files that are not queued""" if "username" in session: - current_tasks = taskDispatcher.get_queued_tasks() + taskDispatcher.get_current_tasks() - queued_websites = [task.website_id for task in current_tasks] - all_websites = db.get_all_websites() - non_queued_websites = list(set(all_websites).difference(queued_websites)) - - empty_websites = searchEngine.are_empty(non_queued_websites) + empty_websites = get_empty_websites() for website in empty_websites: #db.delete_website(website) @@ -150,6 +156,21 @@ def admin_delete_empty_website(): abort(403) +@app.route("/website/queue_empty") +def admin_queue_empty_websites(): + if "username" in session: + + for website_id in get_empty_websites(): + website = db.get_website_by_id(website_id) + task = Task(website.id, website.url, 1) + taskDispatcher.dispatch_task(task) + flash("Dispatched empty websites", "success") + return redirect("/dashboard") + + else: + abort(403) + + @app.route("/website//clear") def admin_clear_website(website_id): diff --git a/od_util.py b/od_util.py index 21ba503..a111e15 100644 --- a/od_util.py +++ b/od_util.py @@ -7,6 +7,9 @@ import re from ftplib import FTP import config +import urllib3 +urllib3.disable_warnings() + def truncate_path(path, max_len): pattern = re.compile(r"/?.*?/") @@ -170,7 +173,7 @@ def is_od(url): ftp.close() return True elif config.SUBMIT_HTTP: - r = requests.get(url, timeout=30, allow_redirects=False) + r = requests.get(url, timeout=30, allow_redirects=False, verify=False) if r.status_code != 200: print("No redirects allowed!") return False diff --git a/task.py b/task.py index bc02f33..d706691 100644 --- a/task.py +++ b/task.py @@ -174,18 +174,17 @@ class TaskDispatcher: for server in self.db.get_crawl_servers(): for task in server.fetch_completed_tasks(): print("Completed task") - # All files are overwritten - self.search.delete_docs(task.website_id) - file_list = server.fetch_website_files(task.website_id) - if file_list: + if task.file_count: + # All files are overwritten + self.search.delete_docs(task.website_id) + file_list = server.fetch_website_files(task.website_id) self.search.import_json(file_list, task.website_id) + # File list is safe to delete once indexed + server.free_website_files(task.website_id) # Update last_modified date for website self.db.update_website_date_if_exists(task.website_id) - # File list is safe to delete once indexed - server.free_website_files(task.website_id) - def dispatch_task(self, task: Task): self._get_available_crawl_server().queue_task(task) @@ -207,7 +206,7 @@ class TaskDispatcher: return server_with_most_free_slots - def get_queued_tasks(self) -> list: + def get_queued_tasks(self): queued_tasks_by_server = self._get_queued_tasks_by_server() for queued_tasks in queued_tasks_by_server.values(): diff --git a/templates/dashboard.html b/templates/dashboard.html index 8908468..e9a1275 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -121,6 +121,7 @@ Delete websites with no associated files that are not queued Re-dispatch queued tasks + Re-queue websites with no associated files
Logout