mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 03:46:52 +00:00 
			
		
		
		
	Added stats page
This commit is contained in:
		
							parent
							
								
									7923647ea3
								
							
						
					
					
						commit
						8768e39f08
					
				
							
								
								
									
										15
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								app.py
									
									
									
									
									
								
							| @ -36,7 +36,6 @@ def datetime_format(value, format='%Y-%m-%d %H:%M:%S'): | ||||
|     return time.strftime(format, time.gmtime(value)) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/dl") | ||||
| def downloads(): | ||||
| 
 | ||||
| @ -49,6 +48,18 @@ def downloads(): | ||||
|     return render_template("downloads.html", export_file_stats=export_file_stats) | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/stats") | ||||
| def stats_page(): | ||||
|     crawl_server_stats = taskDispatcher.get_stats_by_server() | ||||
|     return render_template("stats.html", crawl_server_stats=crawl_server_stats) | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/stats/json_chart") | ||||
| def stats_json(): | ||||
|     stats = searchEngine.get_global_stats() | ||||
|     return Response(json.dumps(stats), mimetype="application/json") | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/get_export") | ||||
| def get_export(): | ||||
| 
 | ||||
| @ -78,7 +89,7 @@ def website_json_chart(website_id): | ||||
|         stats = searchEngine.get_stats(website_id) | ||||
|         stats["base_url"] = website.url | ||||
|         stats["report_time"] = website.last_modified | ||||
|         return json.dumps(stats) | ||||
|         return Response(json.dumps(stats), mimetype="application/json") | ||||
|     else: | ||||
|         abort(404) | ||||
| 
 | ||||
|  | ||||
| @ -50,7 +50,7 @@ def task_put(): | ||||
| @auth.login_required | ||||
| def get_completed_tasks(): | ||||
|     json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()]) | ||||
|     return json_str | ||||
|     return Response(json_str, mimetype="application/json") | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/task/current", methods=["GET"]) | ||||
| @ -77,7 +77,14 @@ def get_file_list(website_id): | ||||
| def get_task_logs(): | ||||
| 
 | ||||
|     json_str = json.dumps([result.to_json() for result in tm.get_all_results()]) | ||||
|     return json_str | ||||
|     return Response(json_str, mimetype="application/json") | ||||
| 
 | ||||
| 
 | ||||
| @app.route("/stats/") | ||||
| @auth.login_required | ||||
| def get_stats(): | ||||
|     json_str = json.dumps(tm.get_stats()) | ||||
|     return Response(json_str, mimetype="application/json") | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  | ||||
| @ -103,5 +103,20 @@ class TaskManager: | ||||
|             if task.website_id == task_result.website_id: | ||||
|                 del current_tasks[i] | ||||
| 
 | ||||
|     def get_stats(self): | ||||
| 
 | ||||
|         task_results = self.get_all_results() | ||||
|         stats = dict() | ||||
| 
 | ||||
|         if len(task_results) > 0: | ||||
|             stats["task_count"] = len(task_results) | ||||
|             stats["task_time"] = sum((task.end_time - task.start_time) for task in task_results) | ||||
|             stats["task_time_avg"] = stats["task_time"] / len(task_results) | ||||
|             stats["task_file_count"] = sum(task.file_count for task in task_results) | ||||
|             stats["task_file_count_avg"] = stats["task_file_count"] / len(task_results) | ||||
| 
 | ||||
|         return stats | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -4,7 +4,7 @@ import json | ||||
| 
 | ||||
| payload = json.dumps({ | ||||
|     "website_id": 123, | ||||
|     "url": "https://computerarchive.org/files/computer/", | ||||
|     "url": "ftp://132.249.213.137", | ||||
|     # "url": "http://localhost:8000/", | ||||
|     # "url": "http://ubuntu.mirrorservice.org/", | ||||
|     "priority": 2, | ||||
|  | ||||
| @ -75,7 +75,8 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         # Mappings | ||||
|         self.es.indices.put_mapping(body={"properties": { | ||||
|             "path": {"analyzer": "standard", "type": "text"}, | ||||
|             "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, | ||||
|             "name": {"analyzer": "standard", "type": "text", | ||||
|                      "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, | ||||
|             "mtime": {"type": "date", "format": "epoch_millis"}, | ||||
|             "size": {"type": "long"}, | ||||
|             "website_id": {"type": "integer"}, | ||||
| @ -214,21 +215,70 @@ class ElasticSearchEngine(SearchEngine): | ||||
| 
 | ||||
|     def get_global_stats(self): | ||||
| 
 | ||||
|         result = self.es.search(body={ | ||||
|         # TODO: mem cache this | ||||
| 
 | ||||
|         size_per_ext = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "match_all": {} | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1} | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             "aggs": { | ||||
|                 "total_size": { | ||||
|                     "sum": {"field": "size"} | ||||
|                 "ext_group": { | ||||
|                     "terms": { | ||||
|                         "field": "ext", | ||||
|                         "size": 30 | ||||
|                     }, | ||||
|                     "aggs": { | ||||
|                         "size": { | ||||
|                             "sum": { | ||||
|                                 "field": "size" | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             "size": 0 | ||||
|         }, index=self.index_name) | ||||
| 
 | ||||
|         total_stats = self.es.search(body={ | ||||
|             "query": { | ||||
|                 "bool": { | ||||
|                     "must_not": { | ||||
|                         "term": {"size": -1} | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             "aggs": { | ||||
|                 "file_stats": { | ||||
|                     "extended_stats": { | ||||
|                         "field": "size", | ||||
|                         "sigma": 1 | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             "size": 0 | ||||
|         }, index=self.index_name) | ||||
| 
 | ||||
|         es_stats = self.es.indices.stats(self.index_name) | ||||
|         print(es_stats) | ||||
| 
 | ||||
|         stats = dict() | ||||
|         stats["file_count"] = result["hits"]["total"] | ||||
|         stats["file_size"] = result["aggregations"]["total_size"]["value"] | ||||
|         stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] | ||||
|         stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] | ||||
|         stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] | ||||
|         stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1) | ||||
|         stats["total_count"] = es_stats["indices"][self.index_name]["total"]["indexing"]["index_total"] | ||||
|         stats["total_count_nonzero"] = total_stats["hits"]["total"] | ||||
|         stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] | ||||
|         stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] | ||||
|         stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"] | ||||
|         stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"] | ||||
|         stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"] | ||||
|         stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) | ||||
|                               for b in size_per_ext["aggregations"]["ext_group"]["buckets"]] | ||||
|         stats["base_url"] = "entire database" | ||||
| 
 | ||||
|         return stats | ||||
| 
 | ||||
|  | ||||
| @ -1,20 +1,4 @@ | ||||
| var xhttp = new XMLHttpRequest(); | ||||
| 
 | ||||
| xhttp.onreadystatechange = function() { | ||||
|     if (this.readyState === 4 && this.status === 200) { | ||||
| 
 | ||||
|         console.log("Received: " + this.responseText); | ||||
| 
 | ||||
|         var rData = this.responseText; | ||||
| 
 | ||||
|         drawChart(JSON.parse(rData)); | ||||
|         fillTable(JSON.parse(rData)); | ||||
| 
 | ||||
|         document.getElementById("loading-text").innerHTML = ""; | ||||
|     } | ||||
| }; | ||||
| xhttp.open("GET", "./json_chart", true); | ||||
| xhttp.send(); | ||||
| 
 | ||||
| function drawChart(rData) { | ||||
| 
 | ||||
| @ -70,7 +54,7 @@ function drawChart(rData) { | ||||
|     }); | ||||
| } | ||||
| 
 | ||||
| function fillTable(rData) { | ||||
| function fillWebsiteTable(rData) { | ||||
| 
 | ||||
|     document.getElementById("baseUrl").innerHTML = rData["base_url"]; | ||||
|     document.getElementById("fileCount").innerHTML = rData["total_count"]; | ||||
| @ -79,13 +63,26 @@ function fillTable(rData) { | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| function fillDatabaseTable(rData) { | ||||
|     document.getElementById("esIndexSize") .innerHTML = humanFileSize(rData["es_index_size"]); | ||||
|     document.getElementById("esSearchCount").innerHTML = rData["es_search_count"]; | ||||
|     document.getElementById("esSearchTime").innerHTML = rData["es_search_time"] + "ms"; | ||||
|     document.getElementById("esSearchTimeAvg").innerHTML = rData["es_search_time_avg"].toFixed(2) + "ms"; | ||||
|     document.getElementById("totalCount").innerHTML = rData["total_count"]; | ||||
|     document.getElementById("totalCountNonzero").innerText = rData["total_count_nonzero"]; | ||||
|     document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]); | ||||
|     document.getElementById("sizeAvg").innerHTML = humanFileSize(rData["size_avg"]); | ||||
|     document.getElementById("sizeStdDeviation").innerHTML = humanFileSize(rData["size_std_deviation"]); | ||||
|     document.getElementById("sizeStdDeviationBounds").innerHTML = "[" + humanFileSize(rData["size_std_deviation_bounds"]["lower"]) + | ||||
|         ", " + humanFileSize(rData["size_std_deviation_bounds"]["upper"]) + "]"; | ||||
|     document.getElementById("sizeVariance").innerHTML = humanFileSize(rData["size_variance"]); | ||||
| } | ||||
| 
 | ||||
| function isRelevant(rData, ext) { | ||||
| 
 | ||||
|     // console.log("Checking + " + rData["ext_stats"][ext][2]);
 | ||||
|     // console.log("total + " + rData["total_size"]);
 | ||||
|     // console.log("size + " + rData["ext_stats"][ext][0]);
 | ||||
|     // console.log("min + " + 0.03 * rData["total_count"]);
 | ||||
|     // if (ext[2] === "") {
 | ||||
|     //     return false;
 | ||||
|     // }
 | ||||
| 
 | ||||
|     if(rData["total_size"] < 100000) { | ||||
|         return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"] | ||||
| @ -113,7 +110,7 @@ function getRandomColor() { | ||||
|  */ | ||||
| function humanFileSize(bytes) { | ||||
| 
 | ||||
|     if(bytes <= 0) { | ||||
|     if(bytes === 0) { | ||||
|         return "? B" | ||||
|     } | ||||
| 
 | ||||
|  | ||||
							
								
								
									
										23
									
								
								task.py
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								task.py
									
									
									
									
									
								
							| @ -14,8 +14,9 @@ class CrawlServer: | ||||
|         "Authorization": "Token " + config.CRAWL_SERVER_TOKEN, | ||||
|     } | ||||
| 
 | ||||
|     def __init__(self, url): | ||||
|     def __init__(self, url, name): | ||||
|         self.url = url | ||||
|         self.name = name | ||||
| 
 | ||||
|     def queue_task(self, task: Task) -> bool: | ||||
| 
 | ||||
| @ -80,6 +81,13 @@ class CrawlServer: | ||||
|         except ConnectionError: | ||||
|             return [] | ||||
| 
 | ||||
|     def fetch_stats(self): | ||||
|         try: | ||||
|             r = requests.get(self.url + "/stats/", headers=CrawlServer.headers) | ||||
|             return json.loads(r.text) | ||||
|         except ConnectionError: | ||||
|             return {} | ||||
| 
 | ||||
| 
 | ||||
| class TaskDispatcher: | ||||
| 
 | ||||
| @ -92,7 +100,7 @@ class TaskDispatcher: | ||||
| 
 | ||||
|         # TODO load from config | ||||
|         self.crawl_servers = [ | ||||
|             CrawlServer("http://localhost:5001"), | ||||
|             CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"), | ||||
|         ] | ||||
| 
 | ||||
|     def check_completed_tasks(self): | ||||
| @ -134,8 +142,17 @@ class TaskDispatcher: | ||||
|         task_logs = dict() | ||||
| 
 | ||||
|         for server in self.crawl_servers: | ||||
|             task_logs[server.url] = server.fetch_crawl_logs() | ||||
|             task_logs[server.name] = server.fetch_crawl_logs() | ||||
| 
 | ||||
|         return task_logs | ||||
| 
 | ||||
|     def get_stats_by_server(self) -> dict: | ||||
| 
 | ||||
|         stats = dict() | ||||
| 
 | ||||
|         for server in self.crawl_servers: | ||||
|             stats[server.name] = server.fetch_stats() | ||||
| 
 | ||||
|         return stats | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -40,6 +40,9 @@ | ||||
|             <li class="nav-item"> | ||||
|                 <a class="nav-link {{ "active" if current_page == "dl" else "" }}" href="/dl">Downloads</a> | ||||
|             </li> | ||||
|             <li class="nav-item"> | ||||
|                 <a class="nav-link {{ "active" if current_page == "stats" else "" }}" href="/stats">Stats</a> | ||||
|             </li> | ||||
|         </ul> | ||||
|     </div> | ||||
| </nav> | ||||
|  | ||||
							
								
								
									
										134
									
								
								templates/stats.html
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										134
									
								
								templates/stats.html
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,134 @@ | ||||
| {% extends "layout.html" %} | ||||
| {% set title = "Stats - OD-Database" %} | ||||
| {% set current_page = "stats" %} | ||||
| 
 | ||||
| {% block body %} | ||||
|     <div class="container"> | ||||
| 
 | ||||
|         <div class="card"> | ||||
|             <div class="card-header">Statistics</div> | ||||
|             <div class="card-body"> | ||||
| 
 | ||||
|                 <div id="chart-wrapper" style="margin-bottom: 1em"> | ||||
|                     <p id="loading-text">Calculating...</p> | ||||
|                     <canvas id="typesChart"></canvas> | ||||
|                     <script src="/static/js/Chart.min.js"></script> | ||||
|                     <script src="/static/js/report.js"></script> | ||||
|                 </div> | ||||
| 
 | ||||
|                 <h4>Database stats</h4> | ||||
|                 <table class="table table-striped"> | ||||
|                     <tbody> | ||||
|                     <tr> | ||||
|                         <th>Database index size</th> | ||||
|                         <td id="esIndexSize"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Query count</th> | ||||
|                         <td id="esSearchCount"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Total query time</th> | ||||
|                         <td id="esSearchTime"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Average time per query</th> | ||||
|                         <td id="esSearchTimeAvg"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Total file count</th> | ||||
|                         <td id="totalCount"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>File count with known size</th> | ||||
|                         <td id="totalCountNonzero"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Size total</th> | ||||
|                         <td id="totalSize"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Size average</th> | ||||
|                         <td id="sizeAvg"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Size standard deviation</th> | ||||
|                         <td id="sizeStdDeviation"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Size standard deviation bounds (σ = 1)</th> | ||||
|                         <td id="sizeStdDeviationBounds"></td> | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Size variance</th> | ||||
|                         <td id="sizeVariance"></td> | ||||
|                     </tr> | ||||
|                     </tbody> | ||||
|                 </table> | ||||
| 
 | ||||
|                 <h4>Crawl server stats</h4> | ||||
|                 <table class="table table-striped"> | ||||
|                     <thead> | ||||
|                     <tr> | ||||
|                         <th></th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <th>{{ server }}</th> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     </thead> | ||||
|                     <tbody> | ||||
|                     <tr> | ||||
|                         <th>Completed tasks</th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <td>{{ crawl_server_stats[server].task_count }}</td> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Crawl time</th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <td>{{ crawl_server_stats[server].task_time|round(2) }}s</td> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>Crawl time average</th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <td>{{ crawl_server_stats[server].task_time_avg|round(2) }}s per task</td> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>File crawled</th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <td>{{ crawl_server_stats[server].task_file_count }}</td> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     <tr> | ||||
|                         <th>File crawled average</th> | ||||
|                         {% for server in crawl_server_stats %} | ||||
|                             <td>{{ crawl_server_stats[server].task_file_count_avg | round(2) }} per task</td> | ||||
|                         {% endfor %} | ||||
|                     </tr> | ||||
|                     </tbody> | ||||
|                 </table> | ||||
|             </div> | ||||
|         </div> | ||||
| 
 | ||||
|     </div> | ||||
| 
 | ||||
|     <script> | ||||
|         var xhttp = new XMLHttpRequest(); | ||||
| 
 | ||||
|         xhttp.onreadystatechange = function () { | ||||
|             if (this.readyState === 4 && this.status === 200) { | ||||
| 
 | ||||
|                 let rData = JSON.parse(this.responseText); | ||||
| 
 | ||||
|                 drawChart(rData); | ||||
|                 fillDatabaseTable(rData); | ||||
| 
 | ||||
|                 document.getElementById("loading-text").innerHTML = ""; | ||||
|             } | ||||
|         }; | ||||
|         xhttp.open("GET", "/stats/json_chart", true); | ||||
|         xhttp.send(); | ||||
|     </script> | ||||
| {% endblock body %} | ||||
| @ -43,4 +43,21 @@ | ||||
|             </div> | ||||
|         </div> | ||||
|     </div> | ||||
|     <script> | ||||
|         var xhttp = new XMLHttpRequest(); | ||||
| 
 | ||||
|         xhttp.onreadystatechange = function () { | ||||
|             if (this.readyState === 4 && this.status === 200) { | ||||
| 
 | ||||
|                 var rData = this.responseText; | ||||
| 
 | ||||
|                 drawChart(JSON.parse(rData)); | ||||
|                 fillWebsiteTable(JSON.parse(rData)); | ||||
| 
 | ||||
|                 document.getElementById("loading-text").innerHTML = ""; | ||||
|             } | ||||
|         }; | ||||
|         xhttp.open("GET", "./json_chart", true); | ||||
|         xhttp.send(); | ||||
|     </script> | ||||
| {% endblock body %} | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user