diff --git a/app.py b/app.py
index 1929e95..7e6012e 100644
--- a/app.py
+++ b/app.py
@@ -36,7 +36,6 @@ def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
     return time.strftime(format, time.gmtime(value))
 
 
-
 @app.route("/dl")
 def downloads():
 
@@ -49,6 +48,18 @@ def downloads():
     return render_template("downloads.html", export_file_stats=export_file_stats)
 
 
+@app.route("/stats")
+def stats_page():
+    crawl_server_stats = taskDispatcher.get_stats_by_server()
+    return render_template("stats.html", crawl_server_stats=crawl_server_stats)
+
+
+@app.route("/stats/json_chart")
+def stats_json():
+    stats = searchEngine.get_global_stats()
+    return Response(json.dumps(stats), mimetype="application/json")
+
+
 @app.route("/get_export")
 def get_export():
 
@@ -78,7 +89,7 @@ def website_json_chart(website_id):
         stats = searchEngine.get_stats(website_id)
         stats["base_url"] = website.url
         stats["report_time"] = website.last_modified
-        return json.dumps(stats)
+        return Response(json.dumps(stats), mimetype="application/json")
     else:
         abort(404)
 
diff --git a/crawl_server/server.py b/crawl_server/server.py
index 2514c8b..16889b7 100644
--- a/crawl_server/server.py
+++ b/crawl_server/server.py
@@ -50,7 +50,7 @@ def task_put():
 @auth.login_required
 def get_completed_tasks():
     json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()])
-    return json_str
+    return Response(json_str, mimetype="application/json")
 
 
 @app.route("/task/current", methods=["GET"])
@@ -77,7 +77,14 @@ def get_file_list(website_id):
 def get_task_logs():
 
     json_str = json.dumps([result.to_json() for result in tm.get_all_results()])
-    return json_str
+    return Response(json_str, mimetype="application/json")
+
+
+@app.route("/stats/")
+@auth.login_required
+def get_stats():
+    json_str = json.dumps(tm.get_stats())
+    return Response(json_str, mimetype="application/json")
 
 
 if __name__ == "__main__":
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 82b4687..a40b624 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -103,5 +103,20 @@ class TaskManager:
                 if task.website_id == task_result.website_id:
                     del current_tasks[i]
 
+    def get_stats(self):
+
+        task_results = self.get_all_results()
+        stats = dict()
+
+        if len(task_results) > 0:
+            stats["task_count"] = len(task_results)
+            stats["task_time"] = sum((task.end_time - task.start_time) for task in task_results)
+            stats["task_time_avg"] = stats["task_time"] / len(task_results)
+            stats["task_file_count"] = sum(task.file_count for task in task_results)
+            stats["task_file_count_avg"] = stats["task_file_count"] / len(task_results)
+
+        return stats
+
+
diff --git a/debug_put.py b/debug_put.py
index 450f4c9..2b27f3c 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -4,7 +4,7 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    "url": "https://computerarchive.org/files/computer/",
+    "url": "ftp://132.249.213.137",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,
diff --git a/search/search.py b/search/search.py
index c0eee63..ecba537 100644
--- a/search/search.py
+++ b/search/search.py
@@ -75,7 +75,8 @@ class ElasticSearchEngine(SearchEngine):
         # Mappings
         self.es.indices.put_mapping(body={"properties": {
             "path": {"analyzer": "standard", "type": "text"},
-            "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
+            "name": {"analyzer": "standard", "type": "text",
+                     "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
             "website_id": {"type": "integer"},
@@ -214,21 +215,70 @@ class ElasticSearchEngine(SearchEngine):
 
     def get_global_stats(self):
 
-        result = self.es.search(body={
+        # TODO: mem cache this
+
+        size_per_ext = self.es.search(body={
             "query": {
-                "match_all": {}
+                "bool": {
+                    "must_not": {
+                        "term": {"size": -1}
+                    }
+                }
             },
             "aggs": {
-                "total_size": {
-                    "sum": {"field": "size"}
+                "ext_group": {
+                    "terms": {
+                        "field": "ext",
+                        "size": 30
+                    },
+                    "aggs": {
+                        "size": {
+                            "sum": {
+                                "field": "size"
+                            }
+                        }
+                    }
                 }
             },
             "size": 0
         }, index=self.index_name)
 
+        total_stats = self.es.search(body={
+            "query": {
+                "bool": {
+                    "must_not": {
+                        "term": {"size": -1}
+                    }
+                }
+            },
+            "aggs": {
+                "file_stats": {
+                    "extended_stats": {
+                        "field": "size",
+                        "sigma": 1
+                    }
+                }
+            },
+            "size": 0
+        }, index=self.index_name)
+
+        es_stats = self.es.indices.stats(self.index_name)
+        print(es_stats)
+
         stats = dict()
-        stats["file_count"] = result["hits"]["total"]
-        stats["file_size"] = result["aggregations"]["total_size"]["value"]
+        stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
+        stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
+        stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
+        stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
+        stats["total_count"] = es_stats["indices"][self.index_name]["total"]["indexing"]["index_total"]
+        stats["total_count_nonzero"] = total_stats["hits"]["total"]
+        stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
+        stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
+        stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"]
+        stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"]
+        stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"]
+        stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
+                              for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
+        stats["base_url"] = "entire database"
 
         return stats
-
diff --git a/static/js/report.js b/static/js/report.js
index 3d99cbc..524949f 100644
--- a/static/js/report.js
+++ b/static/js/report.js
@@ -1,20 +1,4 @@
-var xhttp = new XMLHttpRequest();
-xhttp.onreadystatechange = function() {
-    if (this.readyState === 4 && this.status === 200) {
-
-        console.log("Received: " + this.responseText);
-
-        var rData = this.responseText;
-
-        drawChart(JSON.parse(rData));
-        fillTable(JSON.parse(rData));
-
-        document.getElementById("loading-text").innerHTML = "";
-    }
-};
-xhttp.open("GET", "./json_chart", true);
-xhttp.send();
 
 
 function drawChart(rData) {
 
@@ -70,7 +54,7 @@ function drawChart(rData) {
     });
 }
 
-function fillTable(rData) {
+function fillWebsiteTable(rData) {
 
     document.getElementById("baseUrl").innerHTML = rData["base_url"];
     document.getElementById("fileCount").innerHTML = rData["total_count"];
@@ -79,13 +63,26 @@ function fillWebsiteTable(rData) {
 
 }
 
+function fillDatabaseTable(rData) {
+
+    document.getElementById("esIndexSize").innerHTML = humanFileSize(rData["es_index_size"]);
+    document.getElementById("esSearchCount").innerHTML = rData["es_search_count"];
+    document.getElementById("esSearchTime").innerHTML = rData["es_search_time"] + "ms";
+    document.getElementById("esSearchTimeAvg").innerHTML = rData["es_search_time_avg"].toFixed(2) + "ms";
+    document.getElementById("totalCount").innerHTML = rData["total_count"];
+    document.getElementById("totalCountNonzero").innerText = rData["total_count_nonzero"];
+    document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
+    document.getElementById("sizeAvg").innerHTML = humanFileSize(rData["size_avg"]);
+    document.getElementById("sizeStdDeviation").innerHTML = humanFileSize(rData["size_std_deviation"]);
+    document.getElementById("sizeStdDeviationBounds").innerHTML = "[" + humanFileSize(rData["size_std_deviation_bounds"]["lower"]) +
+        ", " + humanFileSize(rData["size_std_deviation_bounds"]["upper"]) + "]";
+    document.getElementById("sizeVariance").innerHTML = humanFileSize(rData["size_variance"]);
+}
 
 function isRelevant(rData, ext) {
 
-    // console.log("Checking + " + rData["ext_stats"][ext][2]);
-    // console.log("total + " + rData["total_size"]);
-    // console.log("size + " + rData["ext_stats"][ext][0]);
-    // console.log("min + " + 0.03 * rData["total_count"]);
+    // if (ext[2] === "") {
+    //     return false;
+    // }
 
     if(rData["total_size"] < 100000) {
         return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"]
@@ -113,7 +110,7 @@ function getRandomColor() {
  */
 function humanFileSize(bytes) {
 
-    if(bytes <= 0) {
+    if(bytes === 0) {
         return "? B"
     }
 
diff --git a/task.py b/task.py
index 4eabf9d..5a9501a 100644
--- a/task.py
+++ b/task.py
@@ -14,8 +14,9 @@ class CrawlServer:
         "Authorization": "Token " + config.CRAWL_SERVER_TOKEN,
     }
 
-    def __init__(self, url):
+    def __init__(self, url, name):
         self.url = url
+        self.name = name
 
     def queue_task(self, task: Task) -> bool:
 
@@ -80,6 +81,13 @@ class CrawlServer:
         except ConnectionError:
             return []
 
+    def fetch_stats(self):
+        try:
+            r = requests.get(self.url + "/stats/", headers=CrawlServer.headers)
+            return json.loads(r.text)
+        except ConnectionError:
+            return {}
+
 
 class TaskDispatcher:
 
@@ -92,7 +100,7 @@ class TaskDispatcher:
         # TODO load from config
         self.crawl_servers = [
-            CrawlServer("http://localhost:5001"),
+            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
         ]
 
     def check_completed_tasks(self):
 
@@ -134,8 +142,17 @@ class TaskDispatcher:
         task_logs = dict()
 
         for server in self.crawl_servers:
-            task_logs[server.url] = server.fetch_crawl_logs()
+            task_logs[server.name] = server.fetch_crawl_logs()
 
         return task_logs
 
+    def get_stats_by_server(self) -> dict:
+
+        stats = dict()
+
+        for server in self.crawl_servers:
+            stats[server.name] = server.fetch_stats()
+
+        return stats
+
diff --git a/templates/layout.html b/templates/layout.html
index 435a8a7..c303b09 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -40,6 +40,9 @@
+                <li class="nav-item">
+                    <a class="nav-link" href="/stats">Stats</a>
+                </li>
diff --git a/templates/stats.html b/templates/stats.html
new file mode 100644
--- /dev/null
+++ b/templates/stats.html
+{% extends "layout.html" %}
+
+{% block body %}
+    <div class="container">
+
+        <p id="loading-text">Calculating...</p>
+
+        <table class="table">
+            <tr><th>Database index size</th><td id="esIndexSize"></td></tr>
+            <tr><th>Query count</th><td id="esSearchCount"></td></tr>
+            <tr><th>Total query time</th><td id="esSearchTime"></td></tr>
+            <tr><th>Average time per query</th><td id="esSearchTimeAvg"></td></tr>
+            <tr><th>Total file count</th><td id="totalCount"></td></tr>
+            <tr><th>File count with known size</th><td id="totalCountNonzero"></td></tr>
+            <tr><th>Size total</th><td id="totalSize"></td></tr>
+            <tr><th>Size average</th><td id="sizeAvg"></td></tr>
+            <tr><th>Size standard deviation</th><td id="sizeStdDeviation"></td></tr>
+            <tr><th>Size standard deviation bounds (σ = 1)</th><td id="sizeStdDeviationBounds"></td></tr>
+            <tr><th>Size variance</th><td id="sizeVariance"></td></tr>
+        </table>
+
+        <table class="table">
+            <tr>
+                <th></th>
+                {% for server in crawl_server_stats %}
+                    <th>{{ server }}</th>
+                {% endfor %}
+            </tr>
+            <tr>
+                <th>Completed tasks</th>
+                {% for server in crawl_server_stats %}
+                    <td>{{ crawl_server_stats[server].task_count }}</td>
+                {% endfor %}
+            </tr>
+            <tr>
+                <th>Crawl time</th>
+                {% for server in crawl_server_stats %}
+                    <td>{{ crawl_server_stats[server].task_time|round(2) }}s</td>
+                {% endfor %}
+            </tr>
+            <tr>
+                <th>Crawl time average</th>
+                {% for server in crawl_server_stats %}
+                    <td>{{ crawl_server_stats[server].task_time_avg|round(2) }}s per task</td>
+                {% endfor %}
+            </tr>
+            <tr>
+                <th>Files crawled</th>
+                {% for server in crawl_server_stats %}
+                    <td>{{ crawl_server_stats[server].task_file_count }}</td>
+                {% endfor %}
+            </tr>
+            <tr>
+                <th>Files crawled average</th>
+                {% for server in crawl_server_stats %}
+                    <td>{{ crawl_server_stats[server].task_file_count_avg|round(2) }} per task</td>
+                {% endfor %}
+            </tr>
+        </table>
+    </div>
+{% endblock %}
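
Not part of the diff above: a minimal smoke-test sketch for the two new stats endpoints. It assumes the Flask app listens on localhost:5000 and a crawl server on localhost:5001 (the ports and the script name are assumptions, not taken from the patch); the auth header mirrors what CrawlServer.fetch_stats() sends.

# smoke_stats.py -- hedged sketch, not part of the patch. Manually exercises
# the endpoints this change introduces; hosts/ports are assumptions.
import json

import requests

import config  # provides CRAWL_SERVER_TOKEN, as used by task.py

# Global database stats from app.py's /stats/json_chart, which now returns
# a Response with an explicit application/json mimetype.
r = requests.get("http://localhost:5000/stats/json_chart")
assert r.headers["Content-Type"].startswith("application/json")
stats = json.loads(r.text)
print("index size: %s bytes, %s files" % (stats["es_index_size"], stats["total_count"]))

# Per-server task stats from crawl_server/server.py's /stats/ route,
# authenticated the same way CrawlServer.fetch_stats() authenticates.
r = requests.get("http://localhost:5001/stats/",
                 headers={"Authorization": "Token " + config.CRAWL_SERVER_TOKEN})
server_stats = json.loads(r.text)

# TaskManager.get_stats() returns an empty dict until at least one task has
# completed; when populated, task_time_avg should be task_time / task_count.
if server_stats:
    assert abs(server_stats["task_time_avg"] -
               server_stats["task_time"] / server_stats["task_count"]) < 1e-9
    print("%(task_count)d tasks, %(task_file_count)d files crawled" % server_stats)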