mirror of
https://github.com/simon987/od-database.git
synced 2025-04-24 12:45:51 +00:00
Added filter for large files in stats
This commit is contained in:
parent
2638e47360
commit
4c9d79fdbf
1
app.py
1
app.py
@ -10,7 +10,6 @@ import config
|
|||||||
from flask_caching import Cache
|
from flask_caching import Cache
|
||||||
from task import TaskDispatcher, Task, CrawlServer
|
from task import TaskDispatcher, Task, CrawlServer
|
||||||
from search.search import ElasticSearchEngine
|
from search.search import ElasticSearchEngine
|
||||||
from jinja2 import Undefined
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
recaptcha = ReCaptcha(app=app,
|
recaptcha = ReCaptcha(app=app,
|
||||||
|
@ -16,7 +16,8 @@ class SearchEngine:
|
|||||||
def import_json(self, in_str: str, website_id: int):
|
def import_json(self, in_str: str, website_id: int):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
|
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
|
||||||
|
date_max) -> {}:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
action_string = '{"index":{}}\n'
|
action_string = '{"index":{}}\n'
|
||||||
return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
|
return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
|
||||||
|
|
||||||
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
|
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
|
||||||
|
date_max) -> {}:
|
||||||
|
|
||||||
filters = []
|
filters = []
|
||||||
if extensions:
|
if extensions:
|
||||||
@ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
size_per_ext = self.es.search(body={
|
size_per_ext = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
"bool": {
|
"bool": {
|
||||||
"must_not": {
|
"filter": [
|
||||||
"term": {"size": -1}
|
{"range": {
|
||||||
}
|
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
|
||||||
|
}}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"ext_group": {
|
"ext_group": {
|
||||||
"terms": {
|
"terms": {
|
||||||
"field": "ext",
|
"field": "ext",
|
||||||
"size": 20
|
"size": 40
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"size": {
|
"size": {
|
||||||
@ -285,14 +289,17 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"size": 0
|
"size": 0
|
||||||
|
|
||||||
}, index=self.index_name, request_timeout=30)
|
}, index=self.index_name, request_timeout=30)
|
||||||
|
|
||||||
total_stats = self.es.search(body={
|
total_stats = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
"bool": {
|
"bool": {
|
||||||
"must_not": {
|
"filter": [
|
||||||
"term": {"size": -1}
|
{"range": {
|
||||||
}
|
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
|
||||||
|
}}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
@ -304,24 +311,20 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"size": 0
|
"size": 0
|
||||||
|
|
||||||
}, index=self.index_name, request_timeout=30)
|
}, index=self.index_name, request_timeout=30)
|
||||||
|
|
||||||
size_and_date_histogram = self.es.search(body={
|
size_and_date_histogram = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
"bool": {
|
"bool": {
|
||||||
"must_not": {
|
|
||||||
"term": {"size": -1},
|
|
||||||
},
|
|
||||||
"filter": [
|
"filter": [
|
||||||
|
{"range": {
|
||||||
|
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
|
||||||
|
}},
|
||||||
{"range": {
|
{"range": {
|
||||||
"mtime": {
|
"mtime": {
|
||||||
"gt": 0 # 1970-01-01
|
"gt": 0 # 1970-01-01
|
||||||
}
|
}
|
||||||
}},
|
|
||||||
{"range": {
|
|
||||||
"size": {
|
|
||||||
"gt": 0
|
|
||||||
}
|
|
||||||
}}
|
}}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
website_scatter = self.es.search(body={
|
website_scatter = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
"bool": {
|
"bool": {
|
||||||
"must_not": {
|
"filter": [
|
||||||
"term": {"size": -1},
|
{"range": {
|
||||||
}
|
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
|
||||||
|
}}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
@ -379,7 +384,9 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
|
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
|
||||||
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
|
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
|
||||||
stats["es_search_time_avg"] = stats["es_search_time"] / (
|
stats["es_search_time_avg"] = stats["es_search_time"] / (
|
||||||
|
|
||||||
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
|
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
|
||||||
|
|
||||||
stats["total_count"] = total_stats["hits"]["total"]
|
stats["total_count"] = total_stats["hits"]["total"]
|
||||||
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
|
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
|
||||||
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
|
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
|
||||||
@ -398,16 +405,16 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
|
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
def stream_all_docs(self):
|
|
||||||
|
|
||||||
|
def stream_all_docs(self):
|
||||||
return helpers.scan(query={
|
return helpers.scan(query={
|
||||||
"query": {
|
"query": {
|
||||||
"match_all": {}
|
"match_all": {}
|
||||||
}
|
}
|
||||||
}, scroll="5m", client=self.es, index=self.index_name)
|
}, scroll="5m", client=self.es, index=self.index_name)
|
||||||
|
|
||||||
def are_empty(self, websites):
|
|
||||||
|
|
||||||
|
def are_empty(self, websites):
|
||||||
result = self.es.search(body={
|
result = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
"bool": {
|
"bool": {
|
||||||
|
@ -25,7 +25,7 @@
|
|||||||
<tr>
|
<tr>
|
||||||
<td><a href="/get_export">out.csv.xz</a></td>
|
<td><a href="/get_export">out.csv.xz</a></td>
|
||||||
<td>{{ export_file_stats.st_size |filesizeformat }}</td>
|
<td>{{ export_file_stats.st_size |filesizeformat }}</td>
|
||||||
<td>{{ export_file_stats.st_mtime|datetime_format }}</td>
|
<td>{{ export_file_stats.st_mtime|datetime_format }} UTC</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</tbody>
|
</tbody>
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
{% if stats and stats["total_size"] %}
|
{% if stats and stats["total_size"] %}
|
||||||
<p class="lead">{{ stats["total_count"] }} files totalling
|
<p class="lead">{{ stats["total_count"] }} files totalling
|
||||||
~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)</p>
|
~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if current_websites %}
|
{% if current_websites %}
|
||||||
<p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret"> </span> </p>
|
<p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret"> </span> </p>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user