diff --git a/app.py b/app.py
index dd845ec..e295fb7 100644
--- a/app.py
+++ b/app.py
@@ -50,7 +50,7 @@ def downloads():
 @app.route("/stats")
 def stats_page():
 
-    crawl_server_stats = taskDispatcher.get_stats_by_server()
+    crawl_server_stats = db.get_stats_by_server()
 
     return render_template("stats.html", crawl_server_stats=crawl_server_stats)
 
@@ -444,7 +444,7 @@ def admin_del_token():
 def admin_crawl_logs():
     if "username" in session:
 
-        results = taskDispatcher.get_task_logs_by_server()
+        results = db.get_crawl_logs()
         return render_template("crawl_logs.html", logs=results)
     else:
diff --git a/crawl_server/database.py b/crawl_server/database.py
index a4e2ea7..1e47b84 100644
--- a/crawl_server/database.py
+++ b/crawl_server/database.py
@@ -5,13 +5,15 @@ import sqlite3
 
 class TaskResult:
 
-    def __init__(self, status_code=None, file_count=0, start_time=0, end_time=0, website_id=0, indexed_time=0):
+    def __init__(self, status_code=None, file_count=0, start_time=0,
+                 end_time=0, website_id=0, indexed_time=0, server_name=""):
         self.status_code = status_code
         self.file_count = file_count
         self.start_time = start_time
         self.end_time = end_time
         self.website_id = website_id
         self.indexed_time = indexed_time
+        self.server_name = server_name
 
     def to_json(self):
         return {
@@ -139,11 +141,3 @@ class TaskManagerDatabase:
 
         return [TaskResult(r[0], r[1], r[2], r[3], r[4]) for r in db_result]
 
-    def get_all_results(self):
-
-        with sqlite3.connect(self.db_path, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) as conn:
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, indexed_time "
-                           "FROM TaskResult ORDER BY id ASC")
-            return [TaskResult(r[1], r[2], r[3].timestamp(), r[4].timestamp(), r[0], r[5].timestamp() if r[5] else None) for r in cursor.fetchall()]
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 050b5d5..970e0c7 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -98,12 +98,5 @@ def pop_queued_tasks():
     return Response(json_str, mimetype="application/json")
 
 
-@app.route("/stats/")
-@auth.login_required
-def get_stats():
-    json_str = json.dumps(tm.get_stats())
-    return Response(json_str, mimetype="application/json")
-
-
 if __name__ == "__main__":
     app.run(port=config.CRAWL_SERVER_PORT, host="0.0.0.0", ssl_context="adhoc")
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 2d44d21..585aa8a 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -36,9 +36,6 @@ class TaskManager:
     def get_non_indexed_results(self):
         return self.db.get_non_indexed_results()
 
-    def get_all_results(self):
-        return self.db.get_all_results()
-
     def execute_queued_task(self):
 
         if len(self.current_tasks) <= self.max_processes:
@@ -103,20 +100,4 @@ class TaskManager:
             if task.website_id == task_result.website_id:
                 del current_tasks[i]
 
-    def get_stats(self):
-
-        task_results = self.get_all_results()
-        stats = dict()
-
-        if len(task_results) > 0:
-            stats["task_count"] = len(task_results)
-            stats["task_time"] = sum((task.end_time - task.start_time) for task in task_results)
-            stats["task_time_avg"] = stats["task_time"] / len(task_results)
-            stats["task_file_count"] = sum(task.file_count for task in task_results)
-            stats["task_file_count_avg"] = stats["task_file_count"] / len(task_results)
-
-        return stats
-
-
diff --git a/database.py b/database.py
index ad50d47..b628fe7 100644
--- a/database.py
+++ b/database.py
@@ -1,10 +1,12 @@
 import sqlite3
 import datetime
+from collections import defaultdict
 from urllib.parse import urlparse
 import os
 import bcrypt
 import uuid
 import task
+from crawl_server.database import TaskResult
 
 
 class InvalidQueryException(Exception):
@@ -312,6 +314,46 @@ class Database:
             cursor.execute("UPDATE CrawlServer SET url=?, name=?, slots=? WHERE id=?", (url, name, slots, server_id))
             conn.commit()
 
+    def log_result(self, result: TaskResult):
+
+        with sqlite3.connect(self.db_path) as conn:
+
+            cursor = conn.cursor()
+
+            cursor.execute("INSERT INTO TaskResult "
+                           "(server, website_id, status_code, file_count, start_time, end_time) "
+                           "VALUES (?,?,?,?,?,?)",
+                           (result.server_id, result.website_id, result.status_code,
+                            result.file_count, result.start_time, result.end_time))
+            conn.commit()
+
+    def get_crawl_logs(self):
+
+        with sqlite3.connect(self.db_path, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) as conn:
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, indexed_time, S.name "
+                           "FROM TaskResult INNER JOIN CrawlServer S on TaskResult.server = S.id "
+                           "ORDER BY end_time DESC")
+            return [TaskResult(r[1], r[2], r[3].timestamp(), r[4].timestamp(),
+                               r[0], r[5].timestamp() if r[5] else None, r[6]) for r in cursor.fetchall()]
+
+    def get_stats_by_server(self):
+
+        stats = dict()
+        task_results = self.get_crawl_logs()
+
+        for server in self.get_crawl_servers():
+            task_count = sum(1 for result in task_results if result.server_name == server.name)
+            if task_count > 0:
+                stats[server.name] = dict()
+                stats[server.name]["file_count"] = sum(result.file_count for result in task_results if result.server_name == server.name)
+                stats[server.name]["time"] = sum((result.end_time - result.start_time) for result in task_results if result.server_name == server.name)
+                stats[server.name]["task_count"] = task_count
+                stats[server.name]["time_avg"] = stats[server.name]["time"] / task_count
+                stats[server.name]["file_count_avg"] = stats[server.name]["file_count"] / task_count
+
+        return stats
diff --git a/init_script.sql b/init_script.sql
index 0b07f12..3bc3327 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -30,4 +30,17 @@ CREATE TABLE CrawlServer (
   name TEXT,
   token TEXT,
   slots INTEGER
-)
+);
+
+CREATE TABLE TaskResult (
+  id INTEGER PRIMARY KEY,
+  server INT,
+  website_id INT,
+  status_code TEXT,
+  file_count INT,
+  start_time TIMESTAMP,
+  end_time TIMESTAMP,
+  indexed_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+
+  FOREIGN KEY (server) REFERENCES CrawlServer(id)
+);
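Note: init_script.sql is presumably only applied when the database file is first created, so an already-initialized deployment would need the new TaskResult table added by hand. A minimal migration sketch, not part of the patch; the "db.sqlite3" path is an assumed placeholder for whatever path the Database class is configured with:

```python
import sqlite3

# Same definition as the init_script.sql hunk above; IF NOT EXISTS makes the
# script safe to re-run. The database path below is an assumption.
MIGRATION = """
CREATE TABLE IF NOT EXISTS TaskResult (
  id INTEGER PRIMARY KEY,
  server INT,
  website_id INT,
  status_code TEXT,
  file_count INT,
  start_time TIMESTAMP,
  end_time TIMESTAMP,
  indexed_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  FOREIGN KEY (server) REFERENCES CrawlServer(id)
);
"""

with sqlite3.connect("db.sqlite3") as conn:
    conn.executescript(MIGRATION)
```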
print("Problem while fetching stats for '" + self.name + "': " + str(r.status_code)) - print(r.text) - return [] - - return json.loads(r.text) - except ConnectionError: - return {} - def pop_queued_tasks(self): try: r = requests.get(self.url + "/task/pop_all", headers=self._generate_headers(), verify=False) @@ -172,8 +142,11 @@ class TaskDispatcher: def check_completed_tasks(self): for server in self.db.get_crawl_servers(): - for task in server.fetch_completed_tasks(): + for task in server.pop_completed_tasks(): print("Completed task") + + task.server_id = server.id + if task.file_count: # All files are overwritten self.search.delete_docs(task.website_id) @@ -185,6 +158,8 @@ class TaskDispatcher: # Update last_modified date for website self.db.update_website_date_if_exists(task.website_id) + self.db.log_result(task) + def dispatch_task(self, task: Task): self._get_available_crawl_server().queue_task(task) @@ -248,26 +223,6 @@ class TaskDispatcher: return current_tasks - def get_task_logs_by_server(self) -> dict: - - task_logs = dict() - - for server in self.db.get_crawl_servers(): - task_logs[server.name] = server.fetch_crawl_logs() - - return task_logs - - def get_stats_by_server(self) -> dict: - - stats = dict() - - for server in self.db.get_crawl_servers(): - server_stats = server.fetch_stats() - if server_stats: - stats[server.name] = server_stats - - return stats - def redispatch_queued(self) -> int: counter = 0 diff --git a/templates/crawl_logs.html b/templates/crawl_logs.html index 60fb839..a48a04b 100644 --- a/templates/crawl_logs.html +++ b/templates/crawl_logs.html @@ -19,10 +19,9 @@ - {% for server in logs %} - {% for task_result in logs[server] %} + {% for task_result in logs %} - {{ server }} + {{ task_result.server_name }} #{{ task_result.website_id }} {{ task_result.status_code }} {{ task_result.file_count }} @@ -32,7 +31,6 @@ {{ task_result.indexed_time | datetime_format }} {% endfor %} - {% endfor %} diff --git a/templates/dashboard.html b/templates/dashboard.html index e9a1275..7cc74c1 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -7,6 +7,9 @@
diff --git a/templates/dashboard.html b/templates/dashboard.html
index e9a1275..7cc74c1 100644
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@@ -7,6 +7,9 @@
     Dashboard
+    Logs
+
+
 
     Crawl servers
 
diff --git a/templates/stats.html b/templates/stats.html
index 9771849..45ead7e 100644
--- a/templates/stats.html
+++ b/templates/stats.html
@@ -90,25 +90,25 @@
             Crawl time
             {% for server in crawl_server_stats %}
-                {{ crawl_server_stats[server].task_time|round(2) }}s
+                {{ crawl_server_stats[server].time|round(2) }}s
             {% endfor %}
 
             Crawl time average
             {% for server in crawl_server_stats %}
-                {{ crawl_server_stats[server].task_time_avg|round(2) }}s per task
+                {{ crawl_server_stats[server].time_avg|round(2) }}s per task
             {% endfor %}
 
             File crawled
             {% for server in crawl_server_stats %}
-                {{ crawl_server_stats[server].task_file_count }}
+                {{ crawl_server_stats[server].file_count }}
             {% endfor %}
 
             File crawled average
             {% for server in crawl_server_stats %}
-                {{ crawl_server_stats[server].task_file_count_avg | round(2) }} per task
+                {{ crawl_server_stats[server].file_count_avg | round(2) }} per task
             {% endfor %}
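End-to-end, the dispatcher now stamps each completed task with the crawl server's id, persists it with db.log_result(), and both /crawl_logs and /stats read back from the central TaskResult table instead of polling every crawl server. A rough sketch of the dict that Database.get_stats_by_server() builds for stats.html; the server name and all figures below are invented for illustration:

```python
# Hypothetical return value of db.get_stats_by_server() (all numbers invented);
# the keys match the ones set in database.py and read by templates/stats.html.
crawl_server_stats = {
    "crawler-1": {
        "task_count": 12,
        "file_count": 54210,
        "file_count_avg": 4517.5,   # files per task
        "time": 3822.72,            # total crawl time, in seconds
        "time_avg": 318.56,         # seconds per task
    },
}

# stats.html reads the same fields per server, e.g.
# {{ crawl_server_stats[server].time_avg|round(2) }}:
for server in crawl_server_stats:
    print(server, round(crawl_server_stats[server]["time_avg"], 2), "s per task")
```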