mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
Added button to queue empty websites
This commit is contained in:
parent
f6ee338c0f
commit
059d9fd366
35
app.py
35
app.py
@ -2,7 +2,7 @@ from flask import Flask, render_template, redirect, request, flash, abort, Respo
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import ssl
|
import itertools
|
||||||
from database import Database, Website, InvalidQueryException
|
from database import Database, Website, InvalidQueryException
|
||||||
from flask_recaptcha import ReCaptcha
|
from flask_recaptcha import ReCaptcha
|
||||||
import od_util
|
import od_util
|
||||||
@ -126,18 +126,24 @@ def admin_redispatch_queued():
|
|||||||
abort(404)
|
abort(404)
|
||||||
|
|
||||||
|
|
||||||
|
def get_empty_websites():
    """Return the websites that are neither queued nor being crawled and
    that the search engine reports as empty.

    Shared by the admin routes that act on empty websites.
    """
    # Collect the website id of every task that is waiting or in progress,
    # so those websites are left out of the emptiness check.
    pending_and_running = itertools.chain(
        taskDispatcher.get_queued_tasks(),
        taskDispatcher.get_current_tasks(),
    )
    busy_website_ids = [pending.website_id for pending in pending_and_running]

    # Everything known to the database, minus the busy ones.
    known_websites = set(db.get_all_websites())
    idle_websites = list(known_websites.difference(busy_website_ids))

    # NOTE(review): are_empty() presumably returns the subset of the given
    # websites with no indexed files - confirm against the search engine.
    return searchEngine.are_empty(idle_websites)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/website/delete_empty")
|
@app.route("/website/delete_empty")
|
||||||
def admin_delete_empty_website():
|
def admin_delete_empty_website():
|
||||||
"""Delete websites with no associated files that are not queued"""
|
"""Delete websites with no associated files that are not queued"""
|
||||||
|
|
||||||
if "username" in session:
|
if "username" in session:
|
||||||
|
|
||||||
current_tasks = taskDispatcher.get_queued_tasks() + taskDispatcher.get_current_tasks()
|
empty_websites = get_empty_websites()
|
||||||
queued_websites = [task.website_id for task in current_tasks]
|
|
||||||
all_websites = db.get_all_websites()
|
|
||||||
non_queued_websites = list(set(all_websites).difference(queued_websites))
|
|
||||||
|
|
||||||
empty_websites = searchEngine.are_empty(non_queued_websites)
|
|
||||||
|
|
||||||
for website in empty_websites:
|
for website in empty_websites:
|
||||||
#db.delete_website(website)
|
#db.delete_website(website)
|
||||||
@ -150,6 +156,21 @@ def admin_delete_empty_website():
|
|||||||
abort(403)
|
abort(403)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/website/queue_empty")
def admin_queue_empty_websites():
    """Dispatch a crawl task for every empty, non-queued website.

    Only available when the session holds a "username"; any other request
    is rejected with 403. On success, flashes a confirmation message and
    redirects to the dashboard.
    """
    if "username" not in session:
        abort(403)
    else:
        for empty_id in get_empty_websites():
            site = db.get_website_by_id(empty_id)
            # NOTE(review): the third Task argument (1) looks like a
            # priority - confirm against the Task constructor.
            taskDispatcher.dispatch_task(Task(site.id, site.url, 1))

        flash("Dispatched empty websites", "success")
        return redirect("/dashboard")
|
||||||
|
|
||||||
|
|
||||||
@app.route("/website/<int:website_id>/clear")
|
@app.route("/website/<int:website_id>/clear")
|
||||||
def admin_clear_website(website_id):
|
def admin_clear_website(website_id):
|
||||||
|
|
||||||
|
@ -7,6 +7,9 @@ import re
|
|||||||
from ftplib import FTP
|
from ftplib import FTP
|
||||||
import config
|
import config
|
||||||
|
|
||||||
|
import urllib3
|
||||||
|
urllib3.disable_warnings()
|
||||||
|
|
||||||
|
|
||||||
def truncate_path(path, max_len):
|
def truncate_path(path, max_len):
|
||||||
pattern = re.compile(r"/?.*?/")
|
pattern = re.compile(r"/?.*?/")
|
||||||
@ -170,7 +173,7 @@ def is_od(url):
|
|||||||
ftp.close()
|
ftp.close()
|
||||||
return True
|
return True
|
||||||
elif config.SUBMIT_HTTP:
|
elif config.SUBMIT_HTTP:
|
||||||
r = requests.get(url, timeout=30, allow_redirects=False)
|
r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
print("No redirects allowed!")
|
print("No redirects allowed!")
|
||||||
return False
|
return False
|
||||||
|
9
task.py
9
task.py
@ -174,18 +174,17 @@ class TaskDispatcher:
|
|||||||
for server in self.db.get_crawl_servers():
|
for server in self.db.get_crawl_servers():
|
||||||
for task in server.fetch_completed_tasks():
|
for task in server.fetch_completed_tasks():
|
||||||
print("Completed task")
|
print("Completed task")
|
||||||
|
if task.file_count:
|
||||||
# All files are overwritten
|
# All files are overwritten
|
||||||
self.search.delete_docs(task.website_id)
|
self.search.delete_docs(task.website_id)
|
||||||
file_list = server.fetch_website_files(task.website_id)
|
file_list = server.fetch_website_files(task.website_id)
|
||||||
if file_list:
|
|
||||||
self.search.import_json(file_list, task.website_id)
|
self.search.import_json(file_list, task.website_id)
|
||||||
|
# File list is safe to delete once indexed
|
||||||
|
server.free_website_files(task.website_id)
|
||||||
|
|
||||||
# Update last_modified date for website
|
# Update last_modified date for website
|
||||||
self.db.update_website_date_if_exists(task.website_id)
|
self.db.update_website_date_if_exists(task.website_id)
|
||||||
|
|
||||||
# File list is safe to delete once indexed
|
|
||||||
server.free_website_files(task.website_id)
|
|
||||||
|
|
||||||
def dispatch_task(self, task: Task):
|
def dispatch_task(self, task: Task):
|
||||||
self._get_available_crawl_server().queue_task(task)
|
self._get_available_crawl_server().queue_task(task)
|
||||||
|
|
||||||
@ -207,7 +206,7 @@ class TaskDispatcher:
|
|||||||
|
|
||||||
return server_with_most_free_slots
|
return server_with_most_free_slots
|
||||||
|
|
||||||
def get_queued_tasks(self) -> list:
|
def get_queued_tasks(self):
|
||||||
|
|
||||||
queued_tasks_by_server = self._get_queued_tasks_by_server()
|
queued_tasks_by_server = self._get_queued_tasks_by_server()
|
||||||
for queued_tasks in queued_tasks_by_server.values():
|
for queued_tasks in queued_tasks_by_server.values():
|
||||||
|
@ -121,6 +121,7 @@
|
|||||||
|
|
||||||
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a>
|
<a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are not queued</a>
|
||||||
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
|
<a class="btn btn-danger" href="/website/redispatch_queued">Re-dispatch queued tasks</a>
|
||||||
|
<a class="btn btn-danger" href="/website/queue_empty">Re-queue websites with no associated files</a>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<a class="btn btn-info" href="/logout">Logout</a>
|
<a class="btn btn-info" href="/logout">Logout</a>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user