From 4c9d79fdbf009afeef02f637e505b25a4ec15da6 Mon Sep 17 00:00:00 2001
From: Simon
Date: Thu, 28 Jun 2018 10:40:54 -0400
Subject: [PATCH] Add filter for large files in stats
---
app.py | 1 -
search/search.py | 107 +++++++++++++++++++++------------------
templates/downloads.html | 2 +-
templates/home.html | 2 +-
4 files changed, 59 insertions(+), 53 deletions(-)
diff --git a/app.py b/app.py
index 71ead8e..fae2c27 100644
--- a/app.py
+++ b/app.py
@@ -10,7 +10,6 @@ import config
from flask_caching import Cache
from task import TaskDispatcher, Task, CrawlServer
from search.search import ElasticSearchEngine
-from jinja2 import Undefined
app = Flask(__name__)
recaptcha = ReCaptcha(app=app,
diff --git a/search/search.py b/search/search.py
index ff98aaf..7ef7607 100644
--- a/search/search.py
+++ b/search/search.py
@@ -16,7 +16,8 @@ class SearchEngine:
def import_json(self, in_str: str, website_id: int):
raise NotImplementedError
- def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
+ def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
+ date_max) -> {}:
raise NotImplementedError
def reset(self):
@@ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine):
action_string = '{"index":{}}\n'
return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
- def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
+ def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
+ date_max) -> {}:
filters = []
if extensions:
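Note: the two hunks above only re-wrap the overlong search() signatures. For
reference, a call sketched against the ElasticSearchEngine signature shown
here; the engine construction and every argument value are illustrative
assumptions, not part of this patch.

    # engine construction is assumed; the argument values are placeholders
    engine = ElasticSearchEngine("od-database")
    results = engine.search(
        query="ubuntu iso", page=0, per_page=50,
        sort_order="score", extensions=["iso"],
        size_min=0, size_max=10 ** 9,        # bytes
        match_all=True, fields=["name", "path"],
        date_min=0, date_max=2 ** 31 - 1,    # unix timestamps
    )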
@@ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine):
size_per_ext = self.es.search(body={
"query": {
"bool": {
- "must_not": {
- "term": {"size": -1}
- }
+ "filter": [
+ {"range": {
+ "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
+ }}
+ ]
}
},
"aggs": {
"ext_group": {
"terms": {
"field": "ext",
- "size": 20
+ "size": 40
},
"aggs": {
"size": {
@@ -285,14 +289,17 @@ class ElasticSearchEngine(SearchEngine):
}
},
"size": 0
+
}, index=self.index_name, request_timeout=30)
total_stats = self.es.search(body={
"query": {
"bool": {
- "must_not": {
- "term": {"size": -1}
- }
+ "filter": [
+ {"range": {
+ "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
+ }}
+ ]
}
},
"aggs": {
@@ -304,24 +311,20 @@ class ElasticSearchEngine(SearchEngine):
}
},
"size": 0
+
}, index=self.index_name, request_timeout=30)
size_and_date_histogram = self.es.search(body={
"query": {
"bool": {
- "must_not": {
- "term": {"size": -1},
- },
"filter": [
+ {"range": {
+ "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
+ }},
{"range": {
"mtime": {
"gt": 0 # 1970-01-01
}
- }},
- {"range": {
- "size": {
- "gt": 0
- }
}}
]
}
@@ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine):
website_scatter = self.es.search(body={
"query": {
"bool": {
- "must_not": {
- "term": {"size": -1},
- }
+ "filter": [
+ {"range": {
+ "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
+ }}
+ ]
}
},
"aggs": {
@@ -379,7 +384,9 @@ class ElasticSearchEngine(SearchEngine):
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / (
+
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
+
stats["total_count"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
@@ -398,40 +405,40 @@ class ElasticSearchEngine(SearchEngine):
return stats
- def stream_all_docs(self):
- return helpers.scan(query={
- "query": {
- "match_all": {}
- }
- }, scroll="5m", client=self.es, index=self.index_name)
+def stream_all_docs(self):
+ return helpers.scan(query={
+ "query": {
+ "match_all": {}
+ }
+ }, scroll="5m", client=self.es, index=self.index_name)
- def are_empty(self, websites):
- result = self.es.search(body={
- "query": {
- "bool": {
- "filter": {
- "terms": {
- "website_id": websites
- },
- }
- }
- },
- "aggs": {
- "websites": {
+def are_empty(self, websites):
+ result = self.es.search(body={
+ "query": {
+ "bool": {
+ "filter": {
"terms": {
- "field": "website_id",
- "size": 100000,
- "min_doc_count": 1
- }
+ "website_id": websites
+ },
}
- },
- "size": 0
- }, index=self.index_name, request_timeout=30)
+ }
+ },
+ "aggs": {
+ "websites": {
+ "terms": {
+ "field": "website_id",
+ "size": 100000,
+ "min_doc_count": 1
+ }
+ }
+ },
+ "size": 0
+ }, index=self.index_name, request_timeout=30)
- non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]]
+ non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]]
- for website in websites:
- if website not in non_empty_websites:
- yield website
+ for website in websites:
+ if website not in non_empty_websites:
+ yield website
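Note: in the final search.py hunk the stream_all_docs() and are_empty()
bodies are removed and re-added unchanged apart from whitespace. A short
usage sketch follows; the engine instance and the website id list are
assumptions.

    # 'engine' is an ElasticSearchEngine instance (construction assumed)
    # stream_all_docs() wraps elasticsearch.helpers.scan and yields every
    # hit in the index, scrolling in 5-minute windows
    file_count = sum(1 for _ in engine.stream_all_docs())

    # are_empty() is a generator yielding the ids from the given list
    # that have no documents in the index
    empty_ids = list(engine.are_empty([1, 2, 3]))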
diff --git a/templates/downloads.html b/templates/downloads.html
index c4b5ce8..f4fb68c 100644
--- a/templates/downloads.html
+++ b/templates/downloads.html
@@ -25,7 +25,7 @@
out.csv.xz |
{{ export_file_stats.st_size |filesizeformat }} |
- {{ export_file_stats.st_mtime|datetime_format }} |
+ {{ export_file_stats.st_mtime|datetime_format }} UTC |
{% endif %}
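Note: this template change only appends a literal "UTC" label after the
datetime_format output. The filter itself is defined in the Flask app and is
not shown in this patch; below is a minimal sketch of a compatible
implementation (same name, format string assumed) that keeps the label
accurate by rendering st_mtime in UTC.

    # hypothetical re-implementation for illustration; the real filter may differ
    from datetime import datetime, timezone

    def datetime_format(timestamp):
        # st_mtime is a POSIX timestamp; format it explicitly in UTC
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M")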
diff --git a/templates/home.html b/templates/home.html
index 8b567ff..63b4fae 100644
--- a/templates/home.html
+++ b/templates/home.html
@@ -10,7 +10,7 @@
{% if stats and stats["total_size"] %}
{{ stats["total_count"] }} files totalling
- ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)
+ ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites
{% endif %}
{% if current_websites %}
Currently indexing {{ current_websites }}