mirror of
https://github.com/simon987/od-database.git
synced 2025-04-18 01:46:46 +00:00
Added button to queue empty websites
This commit is contained in:
parent
f6ee338c0f
commit
059d9fd366
35
app.py
35
app.py
@ -2,7 +2,7 @@ from flask import Flask, render_template, redirect, request, flash, abort, Respo
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import ssl
|
||||
import itertools
|
||||
from database import Database, Website, InvalidQueryException
|
||||
from flask_recaptcha import ReCaptcha
|
||||
import od_util
|
||||
@ -126,18 +126,24 @@ def admin_redispatch_queued():
|
||||
abort(404)
|
||||
|
||||
|
||||
def get_empty_websites():
    """Return the non-queued websites that the search engine reports as empty.

    Gathers the ``website_id`` of every task yielded by the dispatcher's
    queued and current task lists, removes those ids from the result of
    ``db.get_all_websites()``, and hands what is left to
    ``searchEngine.are_empty``, whose result is returned unchanged.

    NOTE(review): assumes ``db.get_all_websites()`` yields website ids
    (they are compared against ``task.website_id``) -- confirm.
    """
    pending_tasks = itertools.chain(
        taskDispatcher.get_queued_tasks(),
        taskDispatcher.get_current_tasks(),
    )

    # Drain the task iterators before touching the database, so the order of
    # the external calls is the same as before.
    busy_ids = []
    for pending in pending_tasks:
        busy_ids.append(pending.website_id)

    idle_ids = set(db.get_all_websites()).difference(busy_ids)

    return searchEngine.are_empty(list(idle_ids))
|
||||
|
||||
|
||||
@app.route("/website/delete_empty")
|
||||
def admin_delete_empty_website():
|
||||
"""Delete websites with no associated files that are not queued"""
|
||||
|
||||
if "username" in session:
|
||||
|
||||
current_tasks = taskDispatcher.get_queued_tasks() + taskDispatcher.get_current_tasks()
|
||||
queued_websites = [task.website_id for task in current_tasks]
|
||||
all_websites = db.get_all_websites()
|
||||
non_queued_websites = list(set(all_websites).difference(queued_websites))
|
||||
|
||||
empty_websites = searchEngine.are_empty(non_queued_websites)
|
||||
empty_websites = get_empty_websites()
|
||||
|
||||
for website in empty_websites:
|
||||
#db.delete_website(website)
|
||||
@ -150,6 +156,21 @@ def admin_delete_empty_website():
|
||||
abort(403)
|
||||
|
||||
|
||||
@app.route("/website/queue_empty")
def admin_queue_empty_websites():
    """Dispatch a crawl task for every website returned by get_empty_websites().

    Requires a ``username`` key in the session; otherwise the request is
    aborted with 403. On success a flash message is recorded and the client
    is redirected to ``/dashboard``.
    """
    # Flask's abort() raises, so nothing below runs for anonymous sessions.
    if "username" not in session:
        abort(403)

    for empty_id in get_empty_websites():
        site = db.get_website_by_id(empty_id)
        # NOTE(review): the third Task argument (1) is presumably a priority -- confirm.
        taskDispatcher.dispatch_task(Task(site.id, site.url, 1))

    flash("Dispatched empty websites", "success")
    return redirect("/dashboard")
|
||||
|
||||
|
||||
@app.route("/website/<int:website_id>/clear")
|
||||
def admin_clear_website(website_id):
|
||||
|
||||
|
@ -7,6 +7,9 @@ import re
|
||||
from ftplib import FTP
|
||||
import config
|
||||
|
||||
import urllib3
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
def truncate_path(path, max_len):
|
||||
pattern = re.compile(r"/?.*?/")
|
||||
@ -170,7 +173,7 @@ def is_od(url):
|
||||
ftp.close()
|
||||
return True
|
||||
elif config.SUBMIT_HTTP:
|
||||
r = requests.get(url, timeout=30, allow_redirects=False)
|
||||
r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
|
||||
if r.status_code != 200:
|
||||
print("No redirects allowed!")
|
||||
return False
|
||||
|
15
task.py
15
task.py
@ -174,18 +174,17 @@ class TaskDispatcher:
|
||||
for server in self.db.get_crawl_servers():
|
||||
for task in server.fetch_completed_tasks():
|
||||
print("Completed task")
|
||||
# All files are overwritten
|
||||
self.search.delete_docs(task.website_id)
|
||||
file_list = server.fetch_website_files(task.website_id)
|
||||
if file_list:
|
||||
if task.file_count:
|
||||
# All files are overwritten
|
||||
self.search.delete_docs(task.website_id)
|
||||
file_list = server.fetch_website_files(task.website_id)
|
||||
self.search.import_json(file_list, task.website_id)
|
||||
# File list is safe to delete once indexed
|
||||
server.free_website_files(task.website_id)
|
||||
|
||||
# Update last_modified date for website
|
||||
self.db.update_website_date_if_exists(task.website_id)
|
||||
|
||||
# File list is safe to delete once indexed
|
||||
server.free_website_files(task.website_id)
|
||||
|
||||
def dispatch_task(self, task: Task):
|
||||
self._get_available_crawl_server().queue_task(task)
|
||||
|
||||
@ -207,7 +206,7 @@ class TaskDispatcher:
|
||||
|
||||
return server_with_most_free_slots
|
||||
|
||||
def get_queued_tasks(self) -> list:
|
||||
def get_queued_tasks(self):
|
||||
|
||||
queued_tasks_by_server = self._get_queued_tasks_by_server()
|
||||
for queued_tasks in queued_tasks_by_server.values():
|
||||
|
@ -121,6 +121,7 @@
|
||||
|
||||
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a>
|
||||
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
|
||||
<a class="btn btn-danger" href="/website/queue_empty">Re-queue websites with no associated files</a>
|
||||
|
||||
<hr>
|
||||
<a class="btn btn-info" href="/logout">Logout</a>
|
||||
|
Loading…
x
Reference in New Issue
Block a user