Added filter for large files in stats

This commit is contained in:
Simon 2018-06-28 10:40:54 -04:00
parent 2638e47360
commit 4c9d79fdbf
4 changed files with 59 additions and 53 deletions

1
app.py
View File

@ -10,7 +10,6 @@ import config
from flask_caching import Cache from flask_caching import Cache
from task import TaskDispatcher, Task, CrawlServer from task import TaskDispatcher, Task, CrawlServer
from search.search import ElasticSearchEngine from search.search import ElasticSearchEngine
from jinja2 import Undefined
app = Flask(__name__) app = Flask(__name__)
recaptcha = ReCaptcha(app=app, recaptcha = ReCaptcha(app=app,

View File

@ -16,7 +16,8 @@ class SearchEngine:
def import_json(self, in_str: str, website_id: int): def import_json(self, in_str: str, website_id: int):
raise NotImplementedError raise NotImplementedError
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}: def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
raise NotImplementedError raise NotImplementedError
def reset(self): def reset(self):
@ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine):
action_string = '{"index":{}}\n' action_string = '{"index":{}}\n'
return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs) return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}: def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
filters = [] filters = []
if extensions: if extensions:
@ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine):
size_per_ext = self.es.search(body={ size_per_ext = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
"must_not": { "filter": [
"term": {"size": -1} {"range": {
} "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
} }
}, },
"aggs": { "aggs": {
"ext_group": { "ext_group": {
"terms": { "terms": {
"field": "ext", "field": "ext",
"size": 20 "size": 40
}, },
"aggs": { "aggs": {
"size": { "size": {
@ -285,14 +289,17 @@ class ElasticSearchEngine(SearchEngine):
} }
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=30) }, index=self.index_name, request_timeout=30)
total_stats = self.es.search(body={ total_stats = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
"must_not": { "filter": [
"term": {"size": -1} {"range": {
} "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
} }
}, },
"aggs": { "aggs": {
@ -304,24 +311,20 @@ class ElasticSearchEngine(SearchEngine):
} }
}, },
"size": 0 "size": 0
}, index=self.index_name, request_timeout=30) }, index=self.index_name, request_timeout=30)
size_and_date_histogram = self.es.search(body={ size_and_date_histogram = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
"must_not": {
"term": {"size": -1},
},
"filter": [ "filter": [
{"range": {
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}},
{"range": { {"range": {
"mtime": { "mtime": {
"gt": 0 # 1970-01-01 "gt": 0 # 1970-01-01
} }
}},
{"range": {
"size": {
"gt": 0
}
}} }}
] ]
} }
@ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine):
website_scatter = self.es.search(body={ website_scatter = self.es.search(body={
"query": { "query": {
"bool": { "bool": {
"must_not": { "filter": [
"term": {"size": -1}, {"range": {
} "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
} }
}, },
"aggs": { "aggs": {
@ -379,7 +384,9 @@ class ElasticSearchEngine(SearchEngine):
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / ( stats["es_search_time_avg"] = stats["es_search_time"] / (
stats["es_search_count"] if stats["es_search_count"] != 0 else 1) stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["total_count"] = total_stats["hits"]["total"] stats["total_count"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
@ -398,40 +405,40 @@ class ElasticSearchEngine(SearchEngine):
return stats return stats
def stream_all_docs(self):
return helpers.scan(query={ def stream_all_docs(self):
"query": { return helpers.scan(query={
"match_all": {} "query": {
} "match_all": {}
}, scroll="5m", client=self.es, index=self.index_name) }
}, scroll="5m", client=self.es, index=self.index_name)
def are_empty(self, websites):
result = self.es.search(body={ def are_empty(self, websites):
"query": { result = self.es.search(body={
"bool": { "query": {
"filter": { "bool": {
"terms": { "filter": {
"website_id": websites
},
}
}
},
"aggs": {
"websites": {
"terms": { "terms": {
"field": "website_id", "website_id": websites
"size": 100000, },
"min_doc_count": 1
}
} }
}, }
"size": 0 },
}, index=self.index_name, request_timeout=30) "aggs": {
"websites": {
"terms": {
"field": "website_id",
"size": 100000,
"min_doc_count": 1
}
}
},
"size": 0
}, index=self.index_name, request_timeout=30)
non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]] non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]]
for website in websites: for website in websites:
if website not in non_empty_websites: if website not in non_empty_websites:
yield website yield website

View File

@ -25,7 +25,7 @@
<tr> <tr>
<td><a href="/get_export">out.csv.xz</a></td> <td><a href="/get_export">out.csv.xz</a></td>
<td>{{ export_file_stats.st_size |filesizeformat }}</td> <td>{{ export_file_stats.st_size |filesizeformat }}</td>
<td>{{ export_file_stats.st_mtime|datetime_format }}</td> <td>{{ export_file_stats.st_mtime|datetime_format }} UTC</td>
</tr> </tr>
{% endif %} {% endif %}
</tbody> </tbody>

View File

@ -10,7 +10,7 @@
{% if stats and stats["total_size"] %} {% if stats and stats["total_size"] %}
<p class="lead">{{ stats["total_count"] }} files totalling <p class="lead">{{ stats["total_count"] }} files totalling
~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)</p> ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
{% endif %} {% endif %}
{% if current_websites %} {% if current_websites %}
<p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret">&nbsp;</span> </p> <p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret">&nbsp;</span> </p>