diff --git a/app.py b/app.py index bcf0676..6591f94 100644 --- a/app.py +++ b/app.py @@ -113,6 +113,19 @@ def websites(): return render_template("websites.html", websites=db.get_websites(100, page)) +@app.route("/website/redispatch_queued") +def admin_redispatch_queued(): + + if "username" in session: + + count = taskDispatcher.redispatch_queued() + flash("Re-dispatched " + str(count) + " tasks", "success") + return redirect("/dashboard") + + else: + abort(404) + + @app.route("/website/delete_empty") def admin_delete_empty_website(): """Delete websites with no associated files that are not queued""" @@ -127,7 +140,8 @@ def admin_delete_empty_website(): empty_websites = searchEngine.are_empty(non_queued_websites) for website in empty_websites: - db.delete_website(website) + #db.delete_website(website) + pass flash("Deleted: " + repr(list(empty_websites)), "success") return redirect("/dashboard") diff --git a/crawl_server/database.py b/crawl_server/database.py index 47603ef..a4e2ea7 100644 --- a/crawl_server/database.py +++ b/crawl_server/database.py @@ -83,6 +83,16 @@ class TaskManagerDatabase: else: return None + def pop_all_tasks(self): + + tasks = self.get_tasks() + + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + + cursor.execute("DELETE FROM Queue") + return tasks + def put_task(self, task: Task): with sqlite3.connect(self.db_path) as conn: diff --git a/crawl_server/server.py b/crawl_server/server.py index 47b63a1..050b5d5 100644 --- a/crawl_server/server.py +++ b/crawl_server/server.py @@ -90,6 +90,14 @@ def get_task_logs(): return Response(json_str, mimetype="application/json") +@app.route("/task/pop_all") +@auth.login_required +def pop_queued_tasks(): + + json_str = json.dumps([task.to_json() for task in tm.pop_tasks()]) + return Response(json_str, mimetype="application/json") + + @app.route("/stats/") @auth.login_required def get_stats(): diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 5e4cd8b..e38b199 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -27,6 +27,9 @@ class TaskManager: def get_tasks(self): return self.db.get_tasks() + def pop_tasks(self): + return self.db.pop_all_tasks() + def get_current_tasks(self): return self.current_tasks diff --git a/task.py b/task.py index d459644..bc02f33 100644 --- a/task.py +++ b/task.py @@ -143,6 +143,21 @@ class CrawlServer: except ConnectionError: return {} + def pop_queued_tasks(self): + try: + r = requests.get(self.url + "/task/pop_all", headers=self._generate_headers(), verify=False) + + if r.status_code != 200: + print("Problem while popping tasks for '" + self.name + "': " + str(r.status_code)) + print(r.text) + + return [ + Task(t["website_id"], t["url"], t["priority"], t["callback_type"], t["callback_args"]) + for t in json.loads(r.text) + ] + except ConnectionError: + return [] + class TaskDispatcher: @@ -176,7 +191,7 @@ class TaskDispatcher: def _get_available_crawl_server(self) -> CrawlServer: - queued_tasks_by_server = self._get_current_tasks_by_server() + queued_tasks_by_server = self._get_queued_tasks_by_server() server_with_most_free_slots = None most_free_slots = -10000 @@ -253,3 +268,15 @@ class TaskDispatcher: stats[server.name] = server_stats return stats + + def redispatch_queued(self) -> int: + + counter = 0 + for server in self.db.get_crawl_servers(): + for task in server.pop_queued_tasks(): + self.dispatch_task(task) + counter += 1 + + return counter + + diff --git a/templates/dashboard.html b/templates/dashboard.html index 777a5e5..8908468 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -119,8 +119,8 @@