From 4c9d79fdbf009afeef02f637e505b25a4ec15da6 Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 28 Jun 2018 10:40:54 -0400 Subject: [PATCH] Added filter for large files in stats --- app.py | 1 - search/search.py | 107 +++++++++++++++++++++------------------ templates/downloads.html | 2 +- templates/home.html | 2 +- 4 files changed, 59 insertions(+), 53 deletions(-) diff --git a/app.py b/app.py index 71ead8e..fae2c27 100644 --- a/app.py +++ b/app.py @@ -10,7 +10,6 @@ import config from flask_caching import Cache from task import TaskDispatcher, Task, CrawlServer from search.search import ElasticSearchEngine -from jinja2 import Undefined app = Flask(__name__) recaptcha = ReCaptcha(app=app, diff --git a/search/search.py b/search/search.py index ff98aaf..7ef7607 100644 --- a/search/search.py +++ b/search/search.py @@ -16,7 +16,8 @@ class SearchEngine: def import_json(self, in_str: str, website_id: int): raise NotImplementedError - def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}: + def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, + date_max) -> {}: raise NotImplementedError def reset(self): @@ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine): action_string = '{"index":{}}\n' return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs) - def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}: + def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, + date_max) -> {}: filters = [] if extensions: @@ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine): size_per_ext = self.es.search(body={ "query": { "bool": { - "must_not": { - "term": {"size": -1} - } + "filter": [ + {"range": { + "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB + }} + ] } }, "aggs": { "ext_group": { "terms": { "field": "ext", - "size": 20 + "size": 40 }, "aggs": { "size": { @@ -285,14 +289,17 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 + }, index=self.index_name, request_timeout=30) total_stats = self.es.search(body={ "query": { "bool": { - "must_not": { - "term": {"size": -1} - } + "filter": [ + {"range": { + "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB + }} + ] } }, "aggs": { @@ -304,24 +311,20 @@ class ElasticSearchEngine(SearchEngine): } }, "size": 0 + }, index=self.index_name, request_timeout=30) size_and_date_histogram = self.es.search(body={ "query": { "bool": { - "must_not": { - "term": {"size": -1}, - }, "filter": [ + {"range": { + "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB + }}, {"range": { "mtime": { "gt": 0 # 1970-01-01 } - }}, - {"range": { - "size": { - "gt": 0 - } }} ] } @@ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine): website_scatter = self.es.search(body={ "query": { "bool": { - "must_not": { - "term": {"size": -1}, - } + "filter": [ + {"range": { + "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB + }} + ] } }, "aggs": { @@ -379,7 +384,9 @@ class ElasticSearchEngine(SearchEngine): stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] stats["es_search_time_avg"] = stats["es_search_time"] / ( + stats["es_search_count"] if stats["es_search_count"] != 0 else 1) + stats["total_count"] = total_stats["hits"]["total"] stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] @@ -398,40 +405,40 @@ class ElasticSearchEngine(SearchEngine): return stats - def stream_all_docs(self): - return helpers.scan(query={ - "query": { - "match_all": {} - } - }, scroll="5m", client=self.es, index=self.index_name) +def stream_all_docs(self): + return helpers.scan(query={ + "query": { + "match_all": {} + } + }, scroll="5m", client=self.es, index=self.index_name) - def are_empty(self, websites): - result = self.es.search(body={ - "query": { - "bool": { - "filter": { - "terms": { - "website_id": websites - }, - } - } - }, - "aggs": { - "websites": { +def are_empty(self, websites): + result = self.es.search(body={ + "query": { + "bool": { + "filter": { "terms": { - "field": "website_id", - "size": 100000, - "min_doc_count": 1 - } + "website_id": websites + }, } - }, - "size": 0 - }, index=self.index_name, request_timeout=30) + } + }, + "aggs": { + "websites": { + "terms": { + "field": "website_id", + "size": 100000, + "min_doc_count": 1 + } + } + }, + "size": 0 + }, index=self.index_name, request_timeout=30) - non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]] + non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]] - for website in websites: - if website not in non_empty_websites: - yield website + for website in websites: + if website not in non_empty_websites: + yield website diff --git a/templates/downloads.html b/templates/downloads.html index c4b5ce8..f4fb68c 100644 --- a/templates/downloads.html +++ b/templates/downloads.html @@ -25,7 +25,7 @@ out.csv.xz {{ export_file_stats.st_size |filesizeformat }} - {{ export_file_stats.st_mtime|datetime_format }} + {{ export_file_stats.st_mtime|datetime_format }} UTC {% endif %} diff --git a/templates/home.html b/templates/home.html index 8b567ff..63b4fae 100644 --- a/templates/home.html +++ b/templates/home.html @@ -10,7 +10,7 @@ {% if stats and stats["total_size"] %}

{{ stats["total_count"] }} files totalling - ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)

+ ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites

{% endif %} {% if current_websites %}

Currently indexing {{ current_websites }}