From 8768e39f089b58a7abf7467eea9adbe6983b6108 Mon Sep 17 00:00:00 2001
From: Simon
Date: Mon, 18 Jun 2018 19:56:25 -0400
Subject: [PATCH] Added stats page

---
 app.py                       |  15 +++-
 crawl_server/server.py       |  11 ++-
 crawl_server/task_manager.py |  15 ++++
 debug_put.py                 |   2 +-
 search/search.py             |  66 ++++++++++++++---
 static/js/report.js          |  41 +++++------
 task.py                      |  23 +++++-
 templates/layout.html        |   3 +
 templates/stats.html         | 134 +++++++++++++++++++++++++++++++++++
 templates/website.html       |  17 +++++
 10 files changed, 289 insertions(+), 38 deletions(-)
 create mode 100644 templates/stats.html

diff --git a/app.py b/app.py
index 1929e95..7e6012e 100644
--- a/app.py
+++ b/app.py
@@ -36,7 +36,6 @@ def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
     return time.strftime(format, time.gmtime(value))
 
 
-
 @app.route("/dl")
 def downloads():
 
@@ -49,6 +48,18 @@ def downloads():
     return render_template("downloads.html", export_file_stats=export_file_stats)
 
 
+@app.route("/stats")
+def stats_page():
+    crawl_server_stats = taskDispatcher.get_stats_by_server()
+    return render_template("stats.html", crawl_server_stats=crawl_server_stats)
+
+
+@app.route("/stats/json_chart")
+def stats_json():
+    stats = searchEngine.get_global_stats()
+    return Response(json.dumps(stats), mimetype="application/json")
+
+
 @app.route("/get_export")
 def get_export():
 
@@ -78,7 +89,7 @@ def website_json_chart(website_id):
         stats = searchEngine.get_stats(website_id)
         stats["base_url"] = website.url
         stats["report_time"] = website.last_modified
-        return json.dumps(stats)
+        return Response(json.dumps(stats), mimetype="application/json")
     else:
         abort(404)
 
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 2514c8b..16889b7 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -50,7 +50,7 @@ def task_put():
 @auth.login_required
 def get_completed_tasks():
     json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()])
-    return json_str
+    return Response(json_str, mimetype="application/json")
 
 
 @app.route("/task/current", methods=["GET"])
@@ -77,7 +77,14 @@ def get_file_list(website_id):
 def get_task_logs():
 
     json_str = json.dumps([result.to_json() for result in tm.get_all_results()])
-    return json_str
+    return Response(json_str, mimetype="application/json")
+
+
+@app.route("/stats/")
+@auth.login_required
+def get_stats():
+    json_str = json.dumps(tm.get_stats())
+    return Response(json_str, mimetype="application/json")
 
 
 if __name__ == "__main__":
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 82b4687..a40b624 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -103,5 +103,20 @@ class TaskManager:
             if task.website_id == task_result.website_id:
                 del current_tasks[i]
 
+    def get_stats(self):
+
+        task_results = self.get_all_results()
+        stats = dict()
+
+        if len(task_results) > 0:
+            stats["task_count"] = len(task_results)
+            stats["task_time"] = sum((task.end_time - task.start_time) for task in task_results)
+            stats["task_time_avg"] = stats["task_time"] / len(task_results)
+            stats["task_file_count"] = sum(task.file_count for task in task_results)
+            stats["task_file_count_avg"] = stats["task_file_count"] / len(task_results)
+
+        return stats
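
Note: the json.dumps -> Response hunks above all exist to set the Content-Type
header — Flask serves a bare string as text/html. A minimal sketch of the
difference, assuming nothing beyond stock Flask (the route names here are
illustrative, not from this patch):

import json
from flask import Flask, Response

app = Flask(__name__)

@app.route("/plain")
def plain():
    return json.dumps({"ok": True})  # served as text/html; charset=utf-8

@app.route("/json")
def as_json():
    # Explicit mimetype, same pattern as the hunks above
    return Response(json.dumps({"ok": True}), mimetype="application/json")

with app.test_client() as c:
    assert c.get("/plain").content_type.startswith("text/html")
    assert c.get("/json").content_type == "application/json"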
"http://ubuntu.mirrorservice.org/", "priority": 2, diff --git a/search/search.py b/search/search.py index c0eee63..ecba537 100644 --- a/search/search.py +++ b/search/search.py @@ -75,7 +75,8 @@ class ElasticSearchEngine(SearchEngine): # Mappings self.es.indices.put_mapping(body={"properties": { "path": {"analyzer": "standard", "type": "text"}, - "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, + "name": {"analyzer": "standard", "type": "text", + "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, "mtime": {"type": "date", "format": "epoch_millis"}, "size": {"type": "long"}, "website_id": {"type": "integer"}, @@ -214,21 +215,70 @@ class ElasticSearchEngine(SearchEngine): def get_global_stats(self): - result = self.es.search(body={ + # TODO: mem cache this + + size_per_ext = self.es.search(body={ "query": { - "match_all": {} + "bool": { + "must_not": { + "term": {"size": -1} + } + } }, "aggs": { - "total_size": { - "sum": {"field": "size"} + "ext_group": { + "terms": { + "field": "ext", + "size": 30 + }, + "aggs": { + "size": { + "sum": { + "field": "size" + } + } + } } }, "size": 0 }, index=self.index_name) + total_stats = self.es.search(body={ + "query": { + "bool": { + "must_not": { + "term": {"size": -1} + } + } + }, + "aggs": { + "file_stats": { + "extended_stats": { + "field": "size", + "sigma": 1 + } + } + }, + "size": 0 + }, index=self.index_name) + + es_stats = self.es.indices.stats(self.index_name) + print(es_stats) + stats = dict() - stats["file_count"] = result["hits"]["total"] - stats["file_size"] = result["aggregations"]["total_size"]["value"] + stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] + stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] + stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] + stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1) + stats["total_count"] = es_stats["indices"][self.index_name]["total"]["indexing"]["index_total"] + stats["total_count_nonzero"] = total_stats["hits"]["total"] + stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] + stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] + stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"] + stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"] + stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"] + stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) + for b in size_per_ext["aggregations"]["ext_group"]["buckets"]] + stats["base_url"] = "entire database" return stats - diff --git a/static/js/report.js b/static/js/report.js index 3d99cbc..524949f 100644 --- a/static/js/report.js +++ b/static/js/report.js @@ -1,20 +1,4 @@ -var xhttp = new XMLHttpRequest(); -xhttp.onreadystatechange = function() { - if (this.readyState === 4 && this.status === 200) { - - console.log("Received: " + this.responseText); - - var rData = this.responseText; - - drawChart(JSON.parse(rData)); - fillTable(JSON.parse(rData)); - - document.getElementById("loading-text").innerHTML = ""; - } -}; -xhttp.open("GET", "./json_chart", true); -xhttp.send(); function drawChart(rData) { @@ -70,7 +54,7 @@ function drawChart(rData) { }); } -function fillTable(rData) { +function 
diff --git a/static/js/report.js b/static/js/report.js
index 3d99cbc..524949f 100644
--- a/static/js/report.js
+++ b/static/js/report.js
@@ -1,20 +1,4 @@
-var xhttp = new XMLHttpRequest();
-xhttp.onreadystatechange = function() {
-    if (this.readyState === 4 && this.status === 200) {
-
-        console.log("Received: " + this.responseText);
-
-        var rData = this.responseText;
-
-        drawChart(JSON.parse(rData));
-        fillTable(JSON.parse(rData));
-
-        document.getElementById("loading-text").innerHTML = "";
-    }
-};
-xhttp.open("GET", "./json_chart", true);
-xhttp.send();
 
 
 function drawChart(rData) {
 
@@ -70,7 +54,7 @@ function drawChart(rData) {
     });
 }
 
-function fillTable(rData) {
+function fillWebsiteTable(rData) {
 
     document.getElementById("baseUrl").innerHTML = rData["base_url"];
     document.getElementById("fileCount").innerHTML = rData["total_count"];
@@ -79,13 +63,26 @@ function fillTable(rData) {
 
 }
 
+function fillDatabaseTable(rData) {
+    document.getElementById("esIndexSize").innerHTML = humanFileSize(rData["es_index_size"]);
+    document.getElementById("esSearchCount").innerHTML = rData["es_search_count"];
+    document.getElementById("esSearchTime").innerHTML = rData["es_search_time"] + "ms";
+    document.getElementById("esSearchTimeAvg").innerHTML = rData["es_search_time_avg"].toFixed(2) + "ms";
+    document.getElementById("totalCount").innerHTML = rData["total_count"];
+    document.getElementById("totalCountNonzero").innerText = rData["total_count_nonzero"];
+    document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
+    document.getElementById("sizeAvg").innerHTML = humanFileSize(rData["size_avg"]);
+    document.getElementById("sizeStdDeviation").innerHTML = humanFileSize(rData["size_std_deviation"]);
+    document.getElementById("sizeStdDeviationBounds").innerHTML = "[" + humanFileSize(rData["size_std_deviation_bounds"]["lower"]) +
+        ", " + humanFileSize(rData["size_std_deviation_bounds"]["upper"]) + "]";
+    document.getElementById("sizeVariance").innerHTML = humanFileSize(rData["size_variance"]);
+}
 
 function isRelevant(rData, ext) {
 
-    // console.log("Checking + " + rData["ext_stats"][ext][2]);
-    // console.log("total + " + rData["total_size"]);
-    // console.log("size + " + rData["ext_stats"][ext][0]);
-    // console.log("min + " + 0.03 * rData["total_count"]);
+    // if (ext[2] === "") {
+    //     return false;
+    // }
 
     if(rData["total_size"] < 100000) {
         return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"]
@@ -113,7 +110,7 @@ function getRandomColor() {
  */
 function humanFileSize(bytes) {
 
-    if(bytes <= 0) {
+    if(bytes === 0) {
         return "? B"
     }
diff --git a/task.py b/task.py
index 4eabf9d..5a9501a 100644
--- a/task.py
+++ b/task.py
@@ -14,8 +14,9 @@ class CrawlServer:
         "Authorization": "Token " + config.CRAWL_SERVER_TOKEN,
     }
 
-    def __init__(self, url):
+    def __init__(self, url, name):
         self.url = url
+        self.name = name
 
     def queue_task(self, task: Task) -> bool:
 
@@ -80,6 +81,13 @@ class CrawlServer:
         except ConnectionError:
             return []
 
+    def fetch_stats(self):
+        try:
+            r = requests.get(self.url + "/stats/", headers=CrawlServer.headers)
+            return json.loads(r.text)
+        except ConnectionError:
+            return {}
+
 
 class TaskDispatcher:
 
@@ -92,7 +100,7 @@ class TaskDispatcher:
 
         # TODO load from config
         self.crawl_servers = [
-            CrawlServer("http://localhost:5001"),
+            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
         ]
 
     def check_completed_tasks(self):
@@ -134,8 +142,17 @@ class TaskDispatcher:
         task_logs = dict()
 
         for server in self.crawl_servers:
-            task_logs[server.url] = server.fetch_crawl_logs()
+            task_logs[server.name] = server.fetch_crawl_logs()
 
         return task_logs
 
+    def get_stats_by_server(self) -> dict:
+
+        stats = dict()
+
+        for server in self.crawl_servers:
+            stats[server.name] = server.fetch_stats()
+
+        return stats
+
diff --git a/templates/layout.html b/templates/layout.html
index 435a8a7..c303b09 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -40,6 +40,9 @@
+                <li class="nav-item {{ "active" if current_page == "stats" else "" }}">
+                    <a class="nav-link" href="/stats">Stats</a>
+                </li>
diff --git a/templates/stats.html b/templates/stats.html
new file mode 100644
index 0000000..7efcce1
--- /dev/null
+++ b/templates/stats.html
@@ -0,0 +1,134 @@
+{% extends "layout.html" %}
+{% set title = "Stats - OD-Database" %}
+{% set current_page = "stats" %}
+
+{% block body %}
+    <div class="container">
+
+        <div class="card">
+            <div class="card-header">Statistics</div>
+            <div class="card-body">
+
+                <p id="loading-text">Calculating...</p>
+                <canvas id="chart"></canvas>
+
+                <h5>Database stats</h5>
+                <table class="table">
+                    <tbody>
+                    <tr><th>Database index size</th><td id="esIndexSize"></td></tr>
+                    <tr><th>Query count</th><td id="esSearchCount"></td></tr>
+                    <tr><th>Total query time</th><td id="esSearchTime"></td></tr>
+                    <tr><th>Average time per query</th><td id="esSearchTimeAvg"></td></tr>
+                    <tr><th>Total file count</th><td id="totalCount"></td></tr>
+                    <tr><th>File count with known size</th><td id="totalCountNonzero"></td></tr>
+                    <tr><th>Size total</th><td id="totalSize"></td></tr>
+                    <tr><th>Size average</th><td id="sizeAvg"></td></tr>
+                    <tr><th>Size standard deviation</th><td id="sizeStdDeviation"></td></tr>
+                    <tr><th>Size standard deviation bounds (σ = 1)</th><td id="sizeStdDeviationBounds"></td></tr>
+                    <tr><th>Size variance</th><td id="sizeVariance"></td></tr>
+                    </tbody>
+                </table>
+
+                <h5>Crawl server stats</h5>
+                <table class="table">
+                    <thead>
+                    <tr>
+                        <th></th>
+                        {% for server in crawl_server_stats %}
+                        <th>{{ server }}</th>
+                        {% endfor %}
+                    </tr>
+                    </thead>
+                    <tbody>
+                    <tr>
+                        <th>Completed tasks</th>
+                        {% for server in crawl_server_stats %}
+                        <td>{{ crawl_server_stats[server].task_count }}</td>
+                        {% endfor %}
+                    </tr>
+                    <tr>
+                        <th>Crawl time</th>
+                        {% for server in crawl_server_stats %}
+                        <td>{{ crawl_server_stats[server].task_time|round(2) }}s</td>
+                        {% endfor %}
+                    </tr>
+                    <tr>
+                        <th>Crawl time average</th>
+                        {% for server in crawl_server_stats %}
+                        <td>{{ crawl_server_stats[server].task_time_avg|round(2) }}s per task</td>
+                        {% endfor %}
+                    </tr>
+                    <tr>
+                        <th>Files crawled</th>
+                        {% for server in crawl_server_stats %}
+                        <td>{{ crawl_server_stats[server].task_file_count }}</td>
+                        {% endfor %}
+                    </tr>
+                    <tr>
+                        <th>Files crawled average</th>
+                        {% for server in crawl_server_stats %}
+                        <td>{{ crawl_server_stats[server].task_file_count_avg|round(2) }} per task</td>
+                        {% endfor %}
+                    </tr>
+                    </tbody>
+                </table>
+
+            </div>
+        </div>
+    </div>
+
+    <script src="/static/js/report.js"></script>
+    <script>
+        var xhttp = new XMLHttpRequest();
+        xhttp.onreadystatechange = function() {
+            if (this.readyState === 4 && this.status === 200) {
+
+                var rData = JSON.parse(this.responseText);
+
+                drawChart(rData);
+                fillDatabaseTable(rData);
+
+                document.getElementById("loading-text").innerHTML = "";
+            }
+        };
+        xhttp.open("GET", "/stats/json_chart", true);
+        xhttp.send();
+    </script>
+
+{% endblock body %}
diff --git a/templates/website.html b/templates/website.html
index 15eaefd..1d1134b 100644
--- a/templates/website.html
+++ b/templates/website.html
@@ -43,4 +43,21 @@
+    <script src="/static/js/report.js"></script>
+    <script>
+        var xhttp = new XMLHttpRequest();
+        xhttp.onreadystatechange = function() {
+            if (this.readyState === 4 && this.status === 200) {
+
+                var rData = JSON.parse(this.responseText);
+
+                drawChart(rData);
+                fillWebsiteTable(rData);
+
+                document.getElementById("loading-text").innerHTML = "";
+            }
+        };
+        xhttp.open("GET", "./json_chart", true);
+        xhttp.send();
+    </script>
 {% endblock body %}
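
Note: a quick smoke test for the endpoints this patch adds — a hedged sketch,
assuming the main app on localhost:5000, the crawl server default of
localhost:5001 shown above, and a configured CRAWL_SERVER_TOKEN (placeholder
below); none of these values are guaranteed by the patch itself.

import requests

# Global stats, consumed by templates/stats.html via fillDatabaseTable()
stats = requests.get("http://localhost:5000/stats/json_chart").json()
print(stats["total_count"], stats["es_index_size"])

# Per-server crawl stats, served by the new token-protected /stats/ route
# in crawl_server/server.py and aggregated by TaskDispatcher.get_stats_by_server()
r = requests.get("http://localhost:5001/stats/",
                 headers={"Authorization": "Token <CRAWL_SERVER_TOKEN>"})
print(r.json().get("task_count", 0))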