From 400abc9a3c9fc471bcd6efe558dec743ad97965e Mon Sep 17 00:00:00 2001 From: Simon Date: Mon, 18 Jun 2018 11:41:26 -0400 Subject: [PATCH] Added crawl logs page --- app.py | 20 +++++++++++++++++- crawl_server/database.py | 15 ++++++++++++-- crawl_server/server.py | 8 ++++++++ crawl_server/task_db_init.sql | 6 +++--- crawl_server/task_manager.py | 3 +++ task.py | 19 ++++++++++++++++++ templates/crawl_logs.html | 38 +++++++++++++++++++++++++++++++++++ templates/downloads.html | 2 +- templates/search.html | 2 +- 9 files changed, 105 insertions(+), 8 deletions(-) create mode 100644 templates/crawl_logs.html diff --git a/app.py b/app.py index 1a077ae..ddece8c 100644 --- a/app.py +++ b/app.py @@ -26,11 +26,17 @@ taskDispatcher = TaskDispatcher() searchEngine = ElasticSearchEngine("od-database") -@app.template_filter("datetime_format") +@app.template_filter("date_format") def datetime_format(value, format='%Y-%m-%d'): return time.strftime(format, time.gmtime(value)) +@app.template_filter("datetime_format") +def datetime_format(value, format='%Y-%m-%d %H:%M:%S'): + return time.strftime(format, time.gmtime(value)) + + + @app.route("/dl") def downloads(): @@ -293,6 +299,18 @@ def admin_del_token(): return abort(403) +@app.route("/logs", methods=["GET"]) +def admin_crawl_logs(): + if "username" in session: + + results = taskDispatcher.get_task_logs_by_server() + print(results) + + return render_template("crawl_logs.html", logs=results) + else: + return abort(403) + + if __name__ == '__main__': if config.USE_SSL: context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) diff --git a/crawl_server/database.py b/crawl_server/database.py index f5c7f70..47603ef 100644 --- a/crawl_server/database.py +++ b/crawl_server/database.py @@ -5,12 +5,13 @@ import sqlite3 class TaskResult: - def __init__(self, status_code=None, file_count=0, start_time=0, end_time=0, website_id=0): + def __init__(self, status_code=None, file_count=0, start_time=0, end_time=0, website_id=0, indexed_time=0): self.status_code = status_code self.file_count = file_count self.start_time = start_time self.end_time = end_time self.website_id = website_id + self.indexed_time = indexed_time def to_json(self): return { @@ -18,7 +19,8 @@ class TaskResult: "file_count": self.file_count, "start_time": self.start_time, "end_time": self.end_time, - "website_id": self.website_id + "website_id": self.website_id, + "indexed_time": self.indexed_time } @@ -126,3 +128,12 @@ class TaskManagerDatabase: conn.commit() return [TaskResult(r[0], r[1], r[2], r[3], r[4]) for r in db_result] + + def get_all_results(self): + + with sqlite3.connect(self.db_path, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) as conn: + cursor = conn.cursor() + + cursor.execute("SELECT website_id, status_code, file_count, start_time, end_time, indexed_time " + "FROM TaskResult ORDER BY id ASC") + return [TaskResult(r[1], r[2], r[3].timestamp(), r[4].timestamp(), r[0], r[5].timestamp() if r[5] else None) for r in cursor.fetchall()] diff --git a/crawl_server/server.py b/crawl_server/server.py index 35d9d20..2514c8b 100644 --- a/crawl_server/server.py +++ b/crawl_server/server.py @@ -72,5 +72,13 @@ def get_file_list(website_id): return abort(404) +@app.route("/task/logs/") +@auth.login_required +def get_task_logs(): + + json_str = json.dumps([result.to_json() for result in tm.get_all_results()]) + return json_str + + if __name__ == "__main__": app.run(port=5001, host="0.0.0.0") diff --git a/crawl_server/task_db_init.sql b/crawl_server/task_db_init.sql index 61a27b4..bc6440c 100644 --- a/crawl_server/task_db_init.sql +++ b/crawl_server/task_db_init.sql @@ -13,7 +13,7 @@ CREATE TABLE TaskResult ( website_id INT, status_code TEXT, file_count INT, - start_time INT, - end_time INT, - indexed_time INT DEFAULT NULL + start_time TIMESTAMP, + end_time TIMESTAMP, + indexed_time TIMESTAMP DEFAULT NULL ); \ No newline at end of file diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index 10729a9..df98b77 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -32,6 +32,9 @@ class TaskManager: def get_non_indexed_results(self): return self.db.get_non_indexed_results() + def get_all_results(self): + return self.db.get_all_results() + def execute_queued_task(self): if len(self.current_tasks) <= self.max_processes: diff --git a/task.py b/task.py index 5267dbf..4eabf9d 100644 --- a/task.py +++ b/task.py @@ -70,6 +70,16 @@ class CrawlServer: except ConnectionError: return "" + def fetch_crawl_logs(self): + + try: + r = requests.get(self.url + "/task/logs/", headers=CrawlServer.headers) + return [ + TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"], r["indexed_time"]) + for r in json.loads(r.text)] + except ConnectionError: + return [] + class TaskDispatcher: @@ -119,4 +129,13 @@ class TaskDispatcher: return current_tasks + def get_task_logs_by_server(self) -> dict: + + task_logs = dict() + + for server in self.crawl_servers: + task_logs[server.url] = server.fetch_crawl_logs() + + return task_logs + diff --git a/templates/crawl_logs.html b/templates/crawl_logs.html new file mode 100644 index 0000000..55c7743 --- /dev/null +++ b/templates/crawl_logs.html @@ -0,0 +1,38 @@ +{% extends "layout.html" %} +{% set title = "Crawl logs - OD-Database" %} + +{% block body %} +
+ + + + + + + + + + + + + + + + {% for server in logs %} + {% for task_result in logs[server] %} + + + + + + + + + + {% endfor %} + {% endfor %} + +
ServerWebsiteStatus codeFile countStartEndIndex
{{ server }}#{{ task_result.website_id }}{{ task_result.status_code }}{{ task_result.file_count }}{{ task_result.start_time | datetime_format }}{{ task_result.end_time | datetime_format }}{{ task_result.indexed_time | datetime_format }}
+ +
+{% endblock body %} diff --git a/templates/downloads.html b/templates/downloads.html index e29b3a7..49c5f59 100644 --- a/templates/downloads.html +++ b/templates/downloads.html @@ -25,7 +25,7 @@ out.csv.xz {{ export_file_stats.st_size |filesizeformat }} - {{ export_file_stats.st_mtime|datetime_format }} + {{ export_file_stats.st_mtime|date_format }} {% endif %} diff --git a/templates/search.html b/templates/search.html index 6aa216f..2cdfc3a 100644 --- a/templates/search.html +++ b/templates/search.html @@ -77,7 +77,7 @@ {# File size & date #}
{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}
- {{ src["mtime"] | datetime_format }} + {{ src["mtime"] | date_format }} {% endfor %}