mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
Added stats page
This commit is contained in:
parent
7923647ea3
commit
8768e39f08
15
app.py
15
app.py
@ -36,7 +36,6 @@ def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
|
||||
return time.strftime(format, time.gmtime(value))
|
||||
|
||||
|
||||
|
||||
@app.route("/dl")
|
||||
def downloads():
|
||||
|
||||
@ -49,6 +48,18 @@ def downloads():
|
||||
return render_template("downloads.html", export_file_stats=export_file_stats)
|
||||
|
||||
|
||||
@app.route("/stats")
def stats_page():
    """Render the statistics page.

    Asks the task dispatcher for the stats of every crawl server and passes
    them to the ``stats.html`` template as ``crawl_server_stats``.
    """
    return render_template(
        "stats.html",
        crawl_server_stats=taskDispatcher.get_stats_by_server(),
    )
|
||||
|
||||
|
||||
@app.route("/stats/json_chart")
def stats_json():
    """Serve the search engine's global statistics as a JSON response."""
    body = json.dumps(searchEngine.get_global_stats())
    return Response(body, mimetype="application/json")
|
||||
|
||||
|
||||
@app.route("/get_export")
|
||||
def get_export():
|
||||
|
||||
@ -78,7 +89,7 @@ def website_json_chart(website_id):
|
||||
stats = searchEngine.get_stats(website_id)
|
||||
stats["base_url"] = website.url
|
||||
stats["report_time"] = website.last_modified
|
||||
return json.dumps(stats)
|
||||
return Response(json.dumps(stats), mimetype="application/json")
|
||||
else:
|
||||
abort(404)
|
||||
|
||||
|
@ -50,7 +50,7 @@ def task_put():
|
||||
@auth.login_required
|
||||
def get_completed_tasks():
|
||||
json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()])
|
||||
return json_str
|
||||
return Response(json_str, mimetype="application/json")
|
||||
|
||||
|
||||
@app.route("/task/current", methods=["GET"])
|
||||
@ -77,7 +77,14 @@ def get_file_list(website_id):
|
||||
def get_task_logs():
|
||||
|
||||
json_str = json.dumps([result.to_json() for result in tm.get_all_results()])
|
||||
return json_str
|
||||
return Response(json_str, mimetype="application/json")
|
||||
|
||||
|
||||
@app.route("/stats/")
@auth.login_required
def get_stats():
    """Return the task manager's aggregate statistics as JSON.

    Access is guarded by ``auth.login_required``.
    """
    return Response(json.dumps(tm.get_stats()), mimetype="application/json")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -103,5 +103,20 @@ class TaskManager:
|
||||
if task.website_id == task_result.website_id:
|
||||
del current_tasks[i]
|
||||
|
||||
def get_stats(self):
    """Aggregate all task results known to this manager.

    Returns:
        dict with the keys ``task_count``, ``task_time``, ``task_time_avg``,
        ``task_file_count`` and ``task_file_count_avg``.

    Every key is always present. With no task results the totals and the
    averages are all zero, so consumers that format these values (the stats
    page rounds the time and average fields) never see a missing key.
    """
    task_results = self.get_all_results()
    task_count = len(task_results)

    # Total time is the sum of (end_time - start_time) over all results;
    # presumably seconds, since the stats page appends "s" -- TODO confirm.
    task_time = sum((task.end_time - task.start_time) for task in task_results)
    task_file_count = sum(task.file_count for task in task_results)

    return {
        "task_count": task_count,
        "task_time": task_time,
        # Guard the averages against division by zero when nothing ran yet.
        "task_time_avg": task_time / task_count if task_count else 0,
        "task_file_count": task_file_count,
        "task_file_count_avg": task_file_count / task_count if task_count else 0,
    }
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -4,7 +4,7 @@ import json
|
||||
|
||||
payload = json.dumps({
|
||||
"website_id": 123,
|
||||
"url": "https://computerarchive.org/files/computer/",
|
||||
"url": "ftp://132.249.213.137",
|
||||
# "url": "http://localhost:8000/",
|
||||
# "url": "http://ubuntu.mirrorservice.org/",
|
||||
"priority": 2,
|
||||
|
@ -75,7 +75,8 @@ class ElasticSearchEngine(SearchEngine):
|
||||
# Mappings
|
||||
self.es.indices.put_mapping(body={"properties": {
|
||||
"path": {"analyzer": "standard", "type": "text"},
|
||||
"name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
|
||||
"name": {"analyzer": "standard", "type": "text",
|
||||
"fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
|
||||
"mtime": {"type": "date", "format": "epoch_millis"},
|
||||
"size": {"type": "long"},
|
||||
"website_id": {"type": "integer"},
|
||||
@ -214,21 +215,70 @@ class ElasticSearchEngine(SearchEngine):
|
||||
|
||||
def get_global_stats(self):
    """Collect database-wide statistics from Elasticsearch.

    Runs two aggregation searches over ``self.index_name`` (both exclude
    documents whose ``size`` is -1) and reads the index-level stats:

    * a ``terms`` aggregation on ``ext`` (top 30 buckets) with the summed
      ``size`` per bucket;
    * an ``extended_stats`` aggregation on ``size`` with ``sigma`` = 1;
    * ``indices.stats`` for store size, search and indexing counters.

    Returns:
        dict with the keys ``es_index_size``, ``es_search_count``,
        ``es_search_time``, ``es_search_time_avg``, ``total_count``,
        ``total_count_nonzero``, ``total_size``, ``size_avg``,
        ``size_std_deviation``, ``size_std_deviation_bounds``,
        ``size_variance``, ``ext_stats`` (list of
        ``(summed size, doc count, extension)`` tuples) and ``base_url``.
    """

    # TODO: mem cache this

    # Shared filter: skip documents whose size is the -1 placeholder.
    known_size_query = {
        "bool": {
            "must_not": {
                "term": {"size": -1}
            }
        }
    }

    size_per_ext = self.es.search(body={
        "query": known_size_query,
        "aggs": {
            "ext_group": {
                "terms": {
                    "field": "ext",
                    "size": 30
                },
                "aggs": {
                    "size": {
                        "sum": {
                            "field": "size"
                        }
                    }
                }
            }
        },
        "size": 0
    }, index=self.index_name)

    total_stats = self.es.search(body={
        "query": known_size_query,
        "aggs": {
            "file_stats": {
                "extended_stats": {
                    "field": "size",
                    "sigma": 1
                }
            }
        },
        "size": 0
    }, index=self.index_name)

    es_stats = self.es.indices.stats(self.index_name)

    index_totals = es_stats["indices"][self.index_name]["total"]
    file_stats = total_stats["aggregations"]["file_stats"]
    search_count = index_totals["search"]["query_total"]
    search_time = index_totals["search"]["query_time_in_millis"]

    stats = dict()
    stats["es_index_size"] = index_totals["store"]["size_in_bytes"]
    stats["es_search_count"] = search_count
    stats["es_search_time"] = search_time
    # Avoid dividing by zero before the first query has been served.
    stats["es_search_time_avg"] = search_time / (search_count if search_count != 0 else 1)
    stats["total_count"] = index_totals["indexing"]["index_total"]
    stats["total_count_nonzero"] = total_stats["hits"]["total"]
    stats["total_size"] = file_stats["sum"]
    stats["size_avg"] = file_stats["avg"]
    stats["size_std_deviation"] = file_stats["std_deviation"]
    stats["size_std_deviation_bounds"] = file_stats["std_deviation_bounds"]
    stats["size_variance"] = file_stats["variance"]
    stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
                          for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
    stats["base_url"] = "entire database"

    return stats
|
||||
|
||||
|
@ -1,20 +1,4 @@
|
||||
var xhttp = new XMLHttpRequest();
|
||||
|
||||
xhttp.onreadystatechange = function() {
|
||||
if (this.readyState === 4 && this.status === 200) {
|
||||
|
||||
console.log("Received: " + this.responseText);
|
||||
|
||||
var rData = this.responseText;
|
||||
|
||||
drawChart(JSON.parse(rData));
|
||||
fillTable(JSON.parse(rData));
|
||||
|
||||
document.getElementById("loading-text").innerHTML = "";
|
||||
}
|
||||
};
|
||||
xhttp.open("GET", "./json_chart", true);
|
||||
xhttp.send();
|
||||
|
||||
function drawChart(rData) {
|
||||
|
||||
@ -70,7 +54,7 @@ function drawChart(rData) {
|
||||
});
|
||||
}
|
||||
|
||||
function fillTable(rData) {
|
||||
function fillWebsiteTable(rData) {
|
||||
|
||||
document.getElementById("baseUrl").innerHTML = rData["base_url"];
|
||||
document.getElementById("fileCount").innerHTML = rData["total_count"];
|
||||
@ -79,13 +63,26 @@ function fillTable(rData) {
|
||||
|
||||
}
|
||||
|
||||
/**
 * Fill the "Database stats" table from the global statistics payload.
 *
 * @param {Object} rData  Parsed JSON carrying the es_* counters, the
 *                        total_* values and the size_* statistics read below.
 */
function fillDatabaseTable(rData) {

    // Every cell holds plain text, so write it with textContent: the value
    // is never parsed as markup and all cells are filled the same way.
    function setCell(id, text) {
        document.getElementById(id).textContent = text;
    }

    var bounds = rData["size_std_deviation_bounds"];

    setCell("esIndexSize", humanFileSize(rData["es_index_size"]));
    setCell("esSearchCount", rData["es_search_count"]);
    setCell("esSearchTime", rData["es_search_time"] + "ms");
    setCell("esSearchTimeAvg", rData["es_search_time_avg"].toFixed(2) + "ms");
    setCell("totalCount", rData["total_count"]);
    setCell("totalCountNonzero", rData["total_count_nonzero"]);
    setCell("totalSize", humanFileSize(rData["total_size"]));
    setCell("sizeAvg", humanFileSize(rData["size_avg"]));
    setCell("sizeStdDeviation", humanFileSize(rData["size_std_deviation"]));
    setCell("sizeStdDeviationBounds",
        "[" + humanFileSize(bounds["lower"]) + ", " + humanFileSize(bounds["upper"]) + "]");
    // NOTE(review): a variance is in squared units, so formatting it as a
    // file size may be misleading -- kept as-is, confirm the intended display.
    setCell("sizeVariance", humanFileSize(rData["size_variance"]));
}
|
||||
|
||||
function isRelevant(rData, ext) {
|
||||
|
||||
// console.log("Checking + " + rData["ext_stats"][ext][2]);
|
||||
// console.log("total + " + rData["total_size"]);
|
||||
// console.log("size + " + rData["ext_stats"][ext][0]);
|
||||
// console.log("min + " + 0.03 * rData["total_count"]);
|
||||
// if (ext[2] === "") {
|
||||
// return false;
|
||||
// }
|
||||
|
||||
if(rData["total_size"] < 100000) {
|
||||
return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"]
|
||||
@ -113,7 +110,7 @@ function getRandomColor() {
|
||||
*/
|
||||
function humanFileSize(bytes) {
|
||||
|
||||
if(bytes <= 0) {
|
||||
if(bytes === 0) {
|
||||
return "? B"
|
||||
}
|
||||
|
||||
|
23
task.py
23
task.py
@ -14,8 +14,9 @@ class CrawlServer:
|
||||
"Authorization": "Token " + config.CRAWL_SERVER_TOKEN,
|
||||
}
|
||||
|
||||
def __init__(self, url):
|
||||
def __init__(self, url, name):
|
||||
self.url = url
|
||||
self.name = name
|
||||
|
||||
def queue_task(self, task: Task) -> bool:
|
||||
|
||||
@ -80,6 +81,13 @@ class CrawlServer:
|
||||
except ConnectionError:
|
||||
return []
|
||||
|
||||
def fetch_stats(self):
    """Fetch this crawl server's aggregate task statistics.

    Sends a GET request to ``<self.url>/stats/`` with the shared
    ``CrawlServer.headers`` and decodes the JSON body.

    Returns:
        The decoded statistics, or an empty dict when the server cannot be
        reached, does not answer in time, answers with an HTTP error status,
        or sends a body that is not valid JSON.
    """
    try:
        # Bound the wait so one unresponsive crawl server cannot stall the
        # caller indefinitely.
        r = requests.get(self.url + "/stats/", headers=CrawlServer.headers, timeout=10)
        # Treat 4xx/5xx (e.g. a rejected token) as "no stats available".
        r.raise_for_status()
        return json.loads(r.text)
    except (ConnectionError, requests.exceptions.RequestException, ValueError):
        # ValueError covers a body that fails JSON decoding.
        return {}
|
||||
|
||||
|
||||
class TaskDispatcher:
|
||||
|
||||
@ -92,7 +100,7 @@ class TaskDispatcher:
|
||||
|
||||
# TODO load from config
|
||||
self.crawl_servers = [
|
||||
CrawlServer("http://localhost:5001"),
|
||||
CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
|
||||
]
|
||||
|
||||
def check_completed_tasks(self):
|
||||
@ -134,8 +142,17 @@ class TaskDispatcher:
|
||||
task_logs = dict()
|
||||
|
||||
for server in self.crawl_servers:
|
||||
task_logs[server.url] = server.fetch_crawl_logs()
|
||||
task_logs[server.name] = server.fetch_crawl_logs()
|
||||
|
||||
return task_logs
|
||||
|
||||
def get_stats_by_server(self) -> dict:
    """Return a mapping of crawl server name to the stats that server reports.

    Each server in ``self.crawl_servers`` is queried once via its
    ``fetch_stats`` method, in list order.
    """
    return {
        crawl_server.name: crawl_server.fetch_stats()
        for crawl_server in self.crawl_servers
    }
|
||||
|
||||
|
||||
|
@ -40,6 +40,9 @@
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{ "active" if current_page == "dl" else "" }}" href="/dl">Downloads</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{ "active" if current_page == "stats" else "" }}" href="/stats">Stats</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
|
134
templates/stats.html
Normal file
134
templates/stats.html
Normal file
@ -0,0 +1,134 @@
|
||||
{% extends "layout.html" %}
|
||||
{% set title = "Stats - OD-Database" %}
|
||||
{% set current_page = "stats" %}
|
||||
|
||||
{% block body %}
|
||||
<div class="container">
|
||||
|
||||
<div class="card">
|
||||
<div class="card-header">Statistics</div>
|
||||
<div class="card-body">
|
||||
|
||||
<div id="chart-wrapper" style="margin-bottom: 1em">
|
||||
<p id="loading-text">Calculating...</p>
|
||||
<canvas id="typesChart"></canvas>
|
||||
<script src="/static/js/Chart.min.js"></script>
|
||||
<script src="/static/js/report.js"></script>
|
||||
</div>
|
||||
|
||||
<h4>Database stats</h4>
|
||||
<table class="table table-striped">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Database index size</th>
|
||||
<td id="esIndexSize"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Query count</th>
|
||||
<td id="esSearchCount"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Total query time</th>
|
||||
<td id="esSearchTime"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Average time per query</th>
|
||||
<td id="esSearchTimeAvg"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Total file count</th>
|
||||
<td id="totalCount"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>File count with known size</th>
|
||||
<td id="totalCountNonzero"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Size total</th>
|
||||
<td id="totalSize"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Size average</th>
|
||||
<td id="sizeAvg"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Size standard deviation</th>
|
||||
<td id="sizeStdDeviation"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Size standard deviation bounds (σ = 1)</th>
|
||||
<td id="sizeStdDeviationBounds"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Size variance</th>
|
||||
<td id="sizeVariance"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h4>Crawl server stats</h4>
|
||||
<table class="table table-striped">
    <thead>
    <tr>
        <th></th>
        {% for server in crawl_server_stats %}
            <th>{{ server }}</th>
        {% endfor %}
    </tr>
    </thead>
    <tbody>
    {# A server's stats may be an empty mapping (it could not be reached, or
       it has no task results yet). Guard every cell so a missing value shows
       "N/A" instead of breaking the render when it is passed to round(). #}
    <tr>
        <th>Completed tasks</th>
        {% for server in crawl_server_stats %}
            <td>{{ crawl_server_stats[server].task_count|default("N/A") }}</td>
        {% endfor %}
    </tr>
    <tr>
        <th>Crawl time</th>
        {% for server in crawl_server_stats %}
            <td>{% if crawl_server_stats[server].task_time is defined %}{{ crawl_server_stats[server].task_time|round(2) }}s{% else %}N/A{% endif %}</td>
        {% endfor %}
    </tr>
    <tr>
        <th>Crawl time average</th>
        {% for server in crawl_server_stats %}
            <td>{% if crawl_server_stats[server].task_time_avg is defined %}{{ crawl_server_stats[server].task_time_avg|round(2) }}s per task{% else %}N/A{% endif %}</td>
        {% endfor %}
    </tr>
    <tr>
        <th>File crawled</th>
        {% for server in crawl_server_stats %}
            <td>{{ crawl_server_stats[server].task_file_count|default("N/A") }}</td>
        {% endfor %}
    </tr>
    <tr>
        <th>File crawled average</th>
        {% for server in crawl_server_stats %}
            <td>{% if crawl_server_stats[server].task_file_count_avg is defined %}{{ crawl_server_stats[server].task_file_count_avg | round(2) }} per task{% else %}N/A{% endif %}</td>
        {% endfor %}
    </tr>
    </tbody>
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<script>
|
||||
var xhttp = new XMLHttpRequest();
|
||||
|
||||
xhttp.onreadystatechange = function () {
|
||||
if (this.readyState === 4 && this.status === 200) {
|
||||
|
||||
let rData = JSON.parse(this.responseText);
|
||||
|
||||
drawChart(rData);
|
||||
fillDatabaseTable(rData);
|
||||
|
||||
document.getElementById("loading-text").innerHTML = "";
|
||||
}
|
||||
};
|
||||
xhttp.open("GET", "/stats/json_chart", true);
|
||||
xhttp.send();
|
||||
</script>
|
||||
{% endblock body %}
|
@ -43,4 +43,21 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
var xhttp = new XMLHttpRequest();
|
||||
|
||||
xhttp.onreadystatechange = function () {
|
||||
if (this.readyState === 4 && this.status === 200) {
|
||||
|
||||
var rData = this.responseText;
|
||||
|
||||
drawChart(JSON.parse(rData));
|
||||
fillWebsiteTable(JSON.parse(rData));
|
||||
|
||||
document.getElementById("loading-text").innerHTML = "";
|
||||
}
|
||||
};
|
||||
xhttp.open("GET", "./json_chart", true);
|
||||
xhttp.send();
|
||||
</script>
|
||||
{% endblock body %}
|
||||
|
Loading…
x
Reference in New Issue
Block a user