Added stats page

Simon, 2018-06-18 19:56:25 -04:00
parent 7923647ea3
commit 8768e39f08
10 changed files with 289 additions and 38 deletions

app.py

@@ -36,7 +36,6 @@ def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
return time.strftime(format, time.gmtime(value))
@app.route("/dl")
def downloads():
@@ -49,6 +48,18 @@ def downloads():
return render_template("downloads.html", export_file_stats=export_file_stats)
@app.route("/stats")
def stats_page():
crawl_server_stats = taskDispatcher.get_stats_by_server()
return render_template("stats.html", crawl_server_stats=crawl_server_stats)
@app.route("/stats/json_chart")
def stats_json():
stats = searchEngine.get_global_stats()
return Response(json.dumps(stats), mimetype="application/json")
@app.route("/get_export")
def get_export():
@@ -78,7 +89,7 @@ def website_json_chart(website_id):
stats = searchEngine.get_stats(website_id)
stats["base_url"] = website.url
stats["report_time"] = website.last_modified
return json.dumps(stats)
return Response(json.dumps(stats), mimetype="application/json")
else:
abort(404)
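Both new endpoints wrap json.dumps() in a Response with an explicit application/json mimetype; a bare `return json.dumps(stats)` (as website_json_chart did before this commit) serves JSON under Flask's default text/html content type. An equivalent sketch using Flask's built-in jsonify helper, not part of this commit:

    from flask import jsonify

    @app.route("/stats/json_chart")
    def stats_json():
        # jsonify serialises the dict and sets mimetype=application/json
        return jsonify(searchEngine.get_global_stats())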


@@ -50,7 +50,7 @@ def task_put():
@auth.login_required
def get_completed_tasks():
json_str = json.dumps([result.to_json() for result in tm.get_non_indexed_results()])
return json_str
return Response(json_str, mimetype="application/json")
@app.route("/task/current", methods=["GET"])
@@ -77,7 +77,14 @@ def get_file_list(website_id):
def get_task_logs():
json_str = json.dumps([result.to_json() for result in tm.get_all_results()])
return json_str
return Response(json_str, mimetype="application/json")
@app.route("/stats/")
@auth.login_required
def get_stats():
json_str = json.dumps(tm.get_stats())
return Response(json_str, mimetype="application/json")
if __name__ == "__main__":
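For reference, a minimal sketch of calling the new token-protected /stats/ endpoint from the outside; the host is an assumption, and the Token header scheme is copied from CrawlServer.headers in task.py further down:

    import requests
    import config  # the same config module task.py reads CRAWL_SERVER_TOKEN from

    # Hypothetical client call; CrawlServer.fetch_stats() below does essentially this
    r = requests.get("http://localhost:5001/stats/",
                     headers={"Authorization": "Token " + config.CRAWL_SERVER_TOKEN})
    stats = r.json()  # e.g. {"task_count": ..., "task_time": ..., ...}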


@@ -103,5 +103,20 @@ class TaskManager:
if task.website_id == task_result.website_id:
del current_tasks[i]
def get_stats(self):
task_results = self.get_all_results()
stats = dict()
if len(task_results) > 0:
stats["task_count"] = len(task_results)
stats["task_time"] = sum((task.end_time - task.start_time) for task in task_results)
stats["task_time_avg"] = stats["task_time"] / len(task_results)
stats["task_file_count"] = sum(task.file_count for task in task_results)
stats["task_file_count_avg"] = stats["task_file_count"] / len(task_results)
return stats
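A worked example of what get_stats() returns, assuming two hypothetical task results with (end_time - start_time, file_count) = (10 s, 200 files) and (30 s, 400 files):

    # task_count          = 2
    # task_time           = 10 + 30   = 40
    # task_time_avg       = 40 / 2    = 20.0
    # task_file_count     = 200 + 400 = 600
    # task_file_count_avg = 600 / 2   = 300.0
    # With no completed tasks, get_stats() returns an empty dict.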


@@ -4,7 +4,7 @@ import json
payload = json.dumps({
"website_id": 123,
"url": "https://computerarchive.org/files/computer/",
"url": "ftp://132.249.213.137",
# "url": "http://localhost:8000/",
# "url": "http://ubuntu.mirrorservice.org/",
"priority": 2,


@@ -75,7 +75,8 @@ class ElasticSearchEngine(SearchEngine):
# Mappings
self.es.indices.put_mapping(body={"properties": {
"path": {"analyzer": "standard", "type": "text"},
"name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
"name": {"analyzer": "standard", "type": "text",
"fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
"mtime": {"type": "date", "format": "epoch_millis"},
"size": {"type": "long"},
"website_id": {"type": "integer"},
@@ -214,21 +215,70 @@ class ElasticSearchEngine(SearchEngine):
def get_global_stats(self):
result = self.es.search(body={
# TODO: mem cache this
size_per_ext = self.es.search(body={
"query": {
"match_all": {}
"bool": {
"must_not": {
"term": {"size": -1}
}
}
},
"aggs": {
"total_size": {
"sum": {"field": "size"}
"ext_group": {
"terms": {
"field": "ext",
"size": 30
},
"aggs": {
"size": {
"sum": {
"field": "size"
}
}
}
}
},
"size": 0
}, index=self.index_name)
total_stats = self.es.search(body={
"query": {
"bool": {
"must_not": {
"term": {"size": -1}
}
}
},
"aggs": {
"file_stats": {
"extended_stats": {
"field": "size",
"sigma": 1
}
}
},
"size": 0
}, index=self.index_name)
es_stats = self.es.indices.stats(self.index_name)
print(es_stats)
stats = dict()
stats["file_count"] = result["hits"]["total"]
stats["file_size"] = result["aggregations"]["total_size"]["value"]
stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["total_count"] = es_stats["indices"][self.index_name]["total"]["indexing"]["index_total"]
stats["total_count_nonzero"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"]
stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"]
stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"]
stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
stats["base_url"] = "entire database"
return stats
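get_global_stats() now issues two searches: a terms aggregation bucketed by ext with a sum sub-aggregation for per-extension sizes, and an extended_stats aggregation over size for the totals (with sigma: 1, std_deviation_bounds comes back as avg ± one standard deviation). Both exclude documents whose size is -1, i.e. files of unknown size. A sketch of how the bucket response maps to the (size, doc_count, key) tuples stored in ext_stats, using a hypothetical two-bucket response:

    # Hypothetical aggregation response fragment
    size_per_ext = {"aggregations": {"ext_group": {"buckets": [
        {"key": "mp4", "doc_count": 120, "size": {"value": 9000000.0}},
        {"key": "iso", "doc_count": 4, "size": {"value": 3500000000.0}},
    ]}}}

    ext_stats = [(b["size"]["value"], b["doc_count"], b["key"])
                 for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
    # -> [(9000000.0, 120, 'mp4'), (3500000000.0, 4, 'iso')]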


@@ -1,20 +1,4 @@
var xhttp = new XMLHttpRequest();
xhttp.onreadystatechange = function() {
if (this.readyState === 4 && this.status === 200) {
console.log("Received: " + this.responseText);
var rData = this.responseText;
drawChart(JSON.parse(rData));
fillTable(JSON.parse(rData));
document.getElementById("loading-text").innerHTML = "";
}
};
xhttp.open("GET", "./json_chart", true);
xhttp.send();
function drawChart(rData) {
@@ -70,7 +54,7 @@ function drawChart(rData) {
});
}
function fillTable(rData) {
function fillWebsiteTable(rData) {
document.getElementById("baseUrl").innerHTML = rData["base_url"];
document.getElementById("fileCount").innerHTML = rData["total_count"];
@@ -79,13 +63,26 @@ function fillTable(rData) {
}
function fillDatabaseTable(rData) {
document.getElementById("esIndexSize") .innerHTML = humanFileSize(rData["es_index_size"]);
document.getElementById("esSearchCount").innerHTML = rData["es_search_count"];
document.getElementById("esSearchTime").innerHTML = rData["es_search_time"] + "ms";
document.getElementById("esSearchTimeAvg").innerHTML = rData["es_search_time_avg"].toFixed(2) + "ms";
document.getElementById("totalCount").innerHTML = rData["total_count"];
document.getElementById("totalCountNonzero").innerText = rData["total_count_nonzero"];
document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
document.getElementById("sizeAvg").innerHTML = humanFileSize(rData["size_avg"]);
document.getElementById("sizeStdDeviation").innerHTML = humanFileSize(rData["size_std_deviation"]);
document.getElementById("sizeStdDeviationBounds").innerHTML = "[" + humanFileSize(rData["size_std_deviation_bounds"]["lower"]) +
", " + humanFileSize(rData["size_std_deviation_bounds"]["upper"]) + "]";
document.getElementById("sizeVariance").innerHTML = humanFileSize(rData["size_variance"]);
}
function isRelevant(rData, ext) {
// console.log("Checking + " + rData["ext_stats"][ext][2]);
// console.log("total + " + rData["total_size"]);
// console.log("size + " + rData["ext_stats"][ext][0]);
// console.log("min + " + 0.03 * rData["total_count"]);
// if (ext[2] === "") {
// return false;
// }
if(rData["total_size"] < 100000) {
return rData["ext_stats"][ext][1] > 0.03 * rData["total_count"]
@@ -113,7 +110,7 @@ function getRandomColor() {
*/
function humanFileSize(bytes) {
if(bytes <= 0) {
if(bytes === 0) {
return "? B"
}
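The humanFileSize guard changes from bytes <= 0 to bytes === 0: the new queries already filter out size: -1 (unknown size) server-side, so the client only needs to special-case zero. The function body is not shown in this hunk; a rough Python equivalent under that assumption, for reference:

    def human_file_size(num_bytes):
        # Unknown/empty sizes render as "? B", matching the JS guard above
        if num_bytes == 0:
            return "? B"
        units = ["B", "KiB", "MiB", "GiB", "TiB"]
        i = 0
        while num_bytes >= 1024 and i < len(units) - 1:
            num_bytes /= 1024.0
            i += 1
        return "%.1f %s" % (num_bytes, units[i])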

task.py

@@ -14,8 +14,9 @@ class CrawlServer:
"Authorization": "Token " + config.CRAWL_SERVER_TOKEN,
}
def __init__(self, url):
def __init__(self, url, name):
self.url = url
self.name = name
def queue_task(self, task: Task) -> bool:
@@ -80,6 +81,13 @@ class CrawlServer:
except ConnectionError:
return []
def fetch_stats(self):
try:
r = requests.get(self.url + "/stats/", headers=CrawlServer.headers)
return json.loads(r.text)
except ConnectionError:
return {}
class TaskDispatcher:
@@ -92,7 +100,7 @@ class TaskDispatcher:
# TODO load from config
self.crawl_servers = [
CrawlServer("http://localhost:5001"),
CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
]
def check_completed_tasks(self):
@@ -134,8 +142,17 @@ class TaskDispatcher:
task_logs = dict()
for server in self.crawl_servers:
task_logs[server.url] = server.fetch_crawl_logs()
task_logs[server.name] = server.fetch_crawl_logs()
return task_logs
def get_stats_by_server(self) -> dict:
stats = dict()
for server in self.crawl_servers:
stats[server.name] = server.fetch_stats()
return stats
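get_stats_by_server() keys each server's stats dict by its new human-readable name, which is exactly what templates/stats.html iterates over below. For a single configured server the return value would look like this (values hypothetical):

    # {
    #     "OVH_VPS_SSD2 #1": {
    #         "task_count": 42,
    #         "task_time": 1234.5,
    #         "task_time_avg": 29.4,
    #         "task_file_count": 100000,
    #         "task_file_count_avg": 2380.9,
    #     }
    # }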


@@ -40,6 +40,9 @@
<li class="nav-item">
<a class="nav-link {{ "active" if current_page == "dl" else "" }}" href="/dl">Downloads</a>
</li>
<li class="nav-item">
<a class="nav-link {{ "active" if current_page == "stats" else "" }}" href="/stats">Stats</a>
</li>
</ul>
</div>
</nav>

templates/stats.html (new file)

@@ -0,0 +1,134 @@
{% extends "layout.html" %}
{% set title = "Stats - OD-Database" %}
{% set current_page = "stats" %}
{% block body %}
<div class="container">
<div class="card">
<div class="card-header">Statistics</div>
<div class="card-body">
<div id="chart-wrapper" style="margin-bottom: 1em">
<p id="loading-text">Calculating...</p>
<canvas id="typesChart"></canvas>
<script src="/static/js/Chart.min.js"></script>
<script src="/static/js/report.js"></script>
</div>
<h4>Database stats</h4>
<table class="table table-striped">
<tbody>
<tr>
<th>Database index size</th>
<td id="esIndexSize"></td>
</tr>
<tr>
<th>Query count</th>
<td id="esSearchCount"></td>
</tr>
<tr>
<th>Total query time</th>
<td id="esSearchTime"></td>
</tr>
<tr>
<th>Average time per query</th>
<td id="esSearchTimeAvg"></td>
</tr>
<tr>
<th>Total file count</th>
<td id="totalCount"></td>
</tr>
<tr>
<th>File count with known size</th>
<td id="totalCountNonzero"></td>
</tr>
<tr>
<th>Size total</th>
<td id="totalSize"></td>
</tr>
<tr>
<th>Size average</th>
<td id="sizeAvg"></td>
</tr>
<tr>
<th>Size standard deviation</th>
<td id="sizeStdDeviation"></td>
</tr>
<tr>
<th>Size standard deviation bounds (σ = 1)</th>
<td id="sizeStdDeviationBounds"></td>
</tr>
<tr>
<th>Size variance</th>
<td id="sizeVariance"></td>
</tr>
</tbody>
</table>
<h4>Crawl server stats</h4>
<table class="table table-striped">
<thead>
<tr>
<th></th>
{% for server in crawl_server_stats %}
<th>{{ server }}</th>
{% endfor %}
</tr>
</thead>
<tbody>
<tr>
<th>Completed tasks</th>
{% for server in crawl_server_stats %}
<td>{{ crawl_server_stats[server].task_count }}</td>
{% endfor %}
</tr>
<tr>
<th>Crawl time</th>
{% for server in crawl_server_stats %}
<td>{{ crawl_server_stats[server].task_time|round(2) }}s</td>
{% endfor %}
</tr>
<tr>
<th>Crawl time average</th>
{% for server in crawl_server_stats %}
<td>{{ crawl_server_stats[server].task_time_avg|round(2) }}s per task</td>
{% endfor %}
</tr>
<tr>
<th>Files crawled</th>
{% for server in crawl_server_stats %}
<td>{{ crawl_server_stats[server].task_file_count }}</td>
{% endfor %}
</tr>
<tr>
<th>Files crawled average</th>
{% for server in crawl_server_stats %}
<td>{{ crawl_server_stats[server].task_file_count_avg | round(2) }} per task</td>
{% endfor %}
</tr>
</tbody>
</table>
</div>
</div>
</div>
<script>
var xhttp = new XMLHttpRequest();
xhttp.onreadystatechange = function () {
if (this.readyState === 4 && this.status === 200) {
let rData = JSON.parse(this.responseText);
drawChart(rData);
fillDatabaseTable(rData);
document.getElementById("loading-text").innerHTML = "";
}
};
xhttp.open("GET", "/stats/json_chart", true);
xhttp.send();
</script>
{% endblock body %}


@@ -43,4 +43,21 @@
</div>
</div>
</div>
<script>
var xhttp = new XMLHttpRequest();
xhttp.onreadystatechange = function () {
if (this.readyState === 4 && this.status === 200) {
var rData = this.responseText;
drawChart(JSON.parse(rData));
fillWebsiteTable(JSON.parse(rData));
document.getElementById("loading-text").innerHTML = "";
}
};
xhttp.open("GET", "./json_chart", true);
xhttp.send();
</script>
{% endblock body %}