Stats are generated in the background and stored to a file, instead of being computed on demand

This commit is contained in:
Simon 2018-07-24 20:08:07 -04:00
parent bf82478fee
commit fbbe952e4d
2 changed files with 25 additions and 11 deletions

2
app.py
View File

@ -68,8 +68,10 @@ def stats_page():
@cache.cached(240)
def stats_json():
    """Serve the pre-generated global stats as JSON; 500 if not yet generated.

    Stats are produced periodically by a background job, so a missing/empty
    result simply means no generation cycle has completed yet.
    """
    stats = searchEngine.get_global_stats()
    if not stats:
        # No cached stats file exists yet on disk.
        return abort(500)
    db.join_website_on_stats(stats)
    return Response(json.dumps(stats), mimetype="application/json")
@app.route("/get_export") @app.route("/get_export")

View File

@ -3,6 +3,7 @@ import time
from elasticsearch import helpers from elasticsearch import helpers
import os import os
import ujson import ujson
from apscheduler.schedulers.background import BackgroundScheduler
class IndexingError(Exception): class IndexingError(Exception):
@ -46,6 +47,10 @@ class ElasticSearchEngine(SearchEngine):
self.index_name = index_name self.index_name = index_name
self.es = elasticsearch.Elasticsearch() self.es = elasticsearch.Elasticsearch()
scheduler = BackgroundScheduler()
scheduler.add_job(self._generate_global_stats, "interval", seconds=180)
scheduler.start()
if not self.es.indices.exists(self.index_name): if not self.es.indices.exists(self.index_name):
self.init() self.init()
@ -271,6 +276,14 @@ class ElasticSearchEngine(SearchEngine):
def get_global_stats(self):
    """Return the cached global stats from ``_stats.json``.

    The file is written periodically by the background job
    ``_generate_global_stats``; until the first run completes (or if the
    file is mid-write/corrupt) there is nothing to serve.

    Returns:
        dict | None: the deserialized stats, or None when the stats file
        is absent or unreadable.
    """
    # EAFP: checking os.path.exists() first is racy — the background job
    # may replace the file between the check and the open. Opening directly
    # and catching the failure closes that window.
    try:
        with open("_stats.json", "r") as f:
            return ujson.load(f)
    except OSError:
        # File not generated yet (or transiently unavailable).
        return None
    except ValueError:
        # Partially-written or corrupt JSON (the writer is not atomic);
        # treat as "not ready" rather than crashing the request.
        return None
def _generate_global_stats(self):
size_per_ext = self.es.search(body={ size_per_ext = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
@ -298,7 +311,7 @@ class ElasticSearchEngine(SearchEngine):
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=20) }, index=self.index_name, request_timeout=120)
total_stats = self.es.search(body={ total_stats = self.es.search(body={
"query": { "query": {
@ -320,7 +333,7 @@ class ElasticSearchEngine(SearchEngine):
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=20) }, index=self.index_name, request_timeout=120)
size_and_date_histogram = self.es.search(body={ size_and_date_histogram = self.es.search(body={
"query": { "query": {
@ -355,7 +368,7 @@ class ElasticSearchEngine(SearchEngine):
} }
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=20) }, index=self.index_name, request_timeout=120)
website_scatter = self.es.search(body={ website_scatter = self.es.search(body={
"query": { "query": {
@ -383,17 +396,15 @@ class ElasticSearchEngine(SearchEngine):
} }
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=20) }, index=self.index_name, request_timeout=120)
es_stats = self.es.indices.stats(self.index_name, request_timeout=20) es_stats = self.es.indices.stats(self.index_name, request_timeout=120)
stats = dict() stats = dict()
stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / ( stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["total_count"] = total_stats["hits"]["total"] stats["total_count"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
@ -411,7 +422,8 @@ class ElasticSearchEngine(SearchEngine):
for b in website_scatter["aggregations"]["websites"]["buckets"]] for b in website_scatter["aggregations"]["websites"]["buckets"]]
stats["base_url"] = "entire database" stats["base_url"] = "entire database"
return stats with open("_stats.json", "w") as f:
ujson.dump(stats, f)
def stream_all_docs(self): def stream_all_docs(self):
return helpers.scan(query={ return helpers.scan(query={