Added filter for large files in stats

Simon 2018-06-28 10:40:54 -04:00
parent 2638e47360
commit 4c9d79fdbf
4 changed files with 59 additions and 53 deletions
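The change, repeated in each of the stats queries below, replaces the old must_not exclusion of documents whose size is -1 with an explicit range filter that only keeps sizes from 0 up to just under 1 TB, so oversized entries no longer skew the statistics. A minimal before/after sketch of the query shape, assuming a plain Elasticsearch bool query body (the variable names here are illustrative, not part of the commit):

    # Before: only documents explicitly flagged with size == -1 were excluded.
    old_body = {
        "query": {
            "bool": {
                "must_not": {
                    "term": {"size": -1}
                }
            }
        }
    }

    # After: keep only plausible sizes, 0 bytes up to (1 TB - 1 byte).
    new_body = {
        "query": {
            "bool": {
                "filter": [
                    {"range": {
                        "size": {"gte": 0, "lte": 1000000000000 - 1}
                    }}
                ]
            }
        }
    }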

app.py

@@ -10,7 +10,6 @@ import config
from flask_caching import Cache
from task import TaskDispatcher, Task, CrawlServer
from search.search import ElasticSearchEngine
from jinja2 import Undefined
app = Flask(__name__)
recaptcha = ReCaptcha(app=app,

View File

@@ -16,7 +16,8 @@ class SearchEngine:
def import_json(self, in_str: str, website_id: int):
raise NotImplementedError
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
def search(self, query, page, per_page, sort_order, extension, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
raise NotImplementedError
def reset(self):
@@ -142,7 +143,8 @@ class ElasticSearchEngine(SearchEngine):
action_string = '{"index":{}}\n'
return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, date_max) -> {}:
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
filters = []
if extensions:
@@ -264,16 +266,18 @@ class ElasticSearchEngine(SearchEngine):
size_per_ext = self.es.search(body={
"query": {
"bool": {
"must_not": {
"term": {"size": -1}
}
"filter": [
{"range": {
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
}
},
"aggs": {
"ext_group": {
"terms": {
"field": "ext",
"size": 20
"size": 40
},
"aggs": {
"size": {
@@ -285,14 +289,17 @@
}
},
"size": 0
}, index=self.index_name, request_timeout=30)
total_stats = self.es.search(body={
"query": {
"bool": {
"must_not": {
"term": {"size": -1}
}
"filter": [
{"range": {
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
}
},
"aggs": {
@@ -304,24 +311,20 @@
}
},
"size": 0
}, index=self.index_name, request_timeout=30)
size_and_date_histogram = self.es.search(body={
"query": {
"bool": {
"must_not": {
"term": {"size": -1},
},
"filter": [
{"range": {
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}},
{"range": {
"mtime": {
"gt": 0 # 1970-01-01
}
}},
{"range": {
"size": {
"gt": 0
}
}}
]
}
@@ -349,9 +352,11 @@ class ElasticSearchEngine(SearchEngine):
website_scatter = self.es.search(body={
"query": {
"bool": {
"must_not": {
"term": {"size": -1},
}
"filter": [
{"range": {
"size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
}}
]
}
},
"aggs": {
@@ -379,7 +384,9 @@
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / (
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["total_count"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
@@ -398,40 +405,40 @@ class ElasticSearchEngine(SearchEngine):
return stats
def stream_all_docs(self):
    return helpers.scan(query={
        "query": {
            "match_all": {}
        }
    }, scroll="5m", client=self.es, index=self.index_name)

def are_empty(self, websites):
    result = self.es.search(body={
        "query": {
            "bool": {
                "filter": {
                    "terms": {
                        "website_id": websites
                    }
                }
            }
        },
        "aggs": {
            "websites": {
                "terms": {
                    "field": "website_id",
                    "size": 100000,
                    "min_doc_count": 1
                }
            }
        },
        "size": 0
    }, index=self.index_name, request_timeout=30)

    non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]]

    for website in websites:
        if website not in non_empty_websites:
            yield website

View File

@ -25,7 +25,7 @@
<tr>
<td><a href="/get_export">out.csv.xz</a></td>
<td>{{ export_file_stats.st_size |filesizeformat }}</td>
<td>{{ export_file_stats.st_mtime|datetime_format }}</td>
<td>{{ export_file_stats.st_mtime|datetime_format }} UTC</td>
</tr>
{% endif %}
</tbody>

View File

@ -10,7 +10,7 @@
{% if stats and stats["total_size"] %}
<p class="lead">{{ stats["total_count"] }} files totalling
~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} website(s)</p>
~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
{% endif %}
{% if current_websites %}
<p>Currently indexing <code>{{ current_websites }}</code><span class="vim-caret">&nbsp;</span> </p>