Change ES settings, big refactor, removed recaptcha

simon987
2019-01-13 12:48:39 -05:00
parent d905c3efd5
commit 7f857d641f
18 changed files with 978 additions and 966 deletions


@@ -75,12 +75,13 @@ class ElasticSearchEngine(SearchEngine):
         self.es.indices.create(index=self.index_name)
         self.es.indices.close(index=self.index_name)

-        # File names and paths
+        # Index settings
         self.es.indices.put_settings(body={
             "analysis": {
                 "tokenizer": {
                     "my_nGram_tokenizer": {
-                        "type": "nGram", "min_gram": 3, "max_gram": 3}
+                        "type": "nGram", "min_gram": 3, "max_gram": 3
+                    }
                 }
             }}, index=self.index_name)
         self.es.indices.put_settings(body={
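For context: with min_gram and max_gram both set to 3, the nGram tokenizer emits every three-character window of its input, which is what lets partial file names match. A minimal sketch of the equivalent behaviour in plain Python (an illustration of the tokenizer's output, not code from this repository):

def trigrams(text: str):
    # Every 3-character window, as the nGram tokenizer produces
    # when min_gram == max_gram == 3.
    return [text[i:i + 3] for i in range(len(text) - 2)]

print(trigrams("holiday.jpg"))
# ['hol', 'oli', 'lid', 'ida', 'day', 'ay.', 'y.j', '.jp', 'jpg']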
@@ -93,16 +94,16 @@ class ElasticSearchEngine(SearchEngine):
             }
         }}, index=self.index_name)

         # Mappings
-        self.es.indices.put_mapping(body={"properties": {
-            "path": {"analyzer": "standard", "type": "text"},
-            "name": {"analyzer": "standard", "type": "text",
-                     "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
-            "mtime": {"type": "date", "format": "epoch_second"},
-            "size": {"type": "long"},
-            "website_id": {"type": "integer"},
-            "ext": {"type": "keyword"}
-        }}, doc_type="file", index=self.index_name)
+        self.es.indices.put_mapping(body={
+            "properties": {
+                "path": {"analyzer": "standard", "type": "text"},
+                "name": {"analyzer": "standard", "type": "text",
+                         "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
+                "mtime": {"type": "date", "format": "epoch_second"},
+                "size": {"type": "long"},
+                "website_id": {"type": "integer"},
+                "ext": {"type": "keyword"},
+            }}, doc_type="file", index=self.index_name)

         self.es.indices.open(index=self.index_name)
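The "fields" entry gives each file name two indexings: standard-analyzed under "name" and trigram-analyzed under "name.nGram". A hedged sketch of a query that could exploit both (the boost value and index name are assumptions, not part of this commit):

query_body = {
    "query": {
        "multi_match": {
            "query": "holiday",
            # Prefer whole-token matches on "name", fall back to
            # substring matches through the trigram sub-field.
            "fields": ["name^2", "name.nGram"]
        }
    }
}
# result = es.search(body=query_body, index="od-database")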
@@ -120,9 +121,9 @@ class ElasticSearchEngine(SearchEngine):
             to_delete = helpers.scan(query={
                 "query": {
-                    "term": {"website_id": website_id}
+                    "match_all": {}
                 }
-            }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120)
+            }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id)

             buf = []
             counter = 0
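Worth noting about this change: routing tells Elasticsearch which shard to send the request to (documents indexed with routing=website_id all land on one shard), so the scan no longer needs a term filter to locate them; routing by itself is shard selection, not document filtering. A conceptual sketch of the mechanism (Elasticsearch actually hashes with murmur3; crc32 here just keeps the demo deterministic):

import zlib

def shard_for(routing_value, num_primary_shards: int) -> int:
    # Conceptual stand-in for ES routing: shard = hash(_routing) % number_of_shards
    return zlib.crc32(str(routing_value).encode()) % num_primary_shards

# All docs for website 142 were indexed onto, and are scanned from, this one shard:
print(shard_for(142, 5))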
@@ -130,12 +131,12 @@ class ElasticSearchEngine(SearchEngine):
                 buf.append(doc)
                 counter += 1

-                if counter >= 400:
-                    self._delete(buf)
+                if counter >= 10000:
+                    self._delete(buf, website_id)
                     buf.clear()
                     counter = 0

             if counter > 0:
-                self._delete(buf)
+                self._delete(buf, website_id)
             break
         except Exception as e:
@@ -144,9 +145,10 @@ class ElasticSearchEngine(SearchEngine):
         logger.debug("Done deleting for " + str(website_id))

-    def _delete(self, docs):
+    def _delete(self, docs, website_id):
         bulk_string = self.create_bulk_delete_string(docs)
-        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30)
+        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
+                              routing=website_id)

         if result["errors"]:
             logger.error("Error in ES bulk delete: \n" + result["errors"])
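create_bulk_delete_string is not shown in this diff; based on the standard _bulk API it plausibly emits one NDJSON action line per document, with index, type and routing supplied on the es.bulk call itself. A hypothetical sketch (the helper's real body may differ):

import json

def create_bulk_delete_string(docs) -> str:
    # One {"delete": ...} action per line; _bulk bodies are
    # newline-delimited JSON terminated by a final newline.
    return "\n".join(json.dumps({"delete": {"_id": doc["_id"]}}) for doc in docs) + "\n"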
@@ -154,7 +156,7 @@ class ElasticSearchEngine(SearchEngine):
     def import_json(self, in_lines, website_id: int):
-        import_every = 400
+        import_every = 10000
         cooldown_time = 0

         docs = []
@@ -183,7 +185,8 @@ class ElasticSearchEngine(SearchEngine):
             try:
                 logger.debug("Indexing " + str(len(docs)) + " docs")
                 bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
-                self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30)
+                self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
+                             routing=docs[0]["website_id"])
                 break
             except Exception as e:
                 logger.error("Error in _index: " + str(e) + ", retrying")
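Passing routing=docs[0]["website_id"] applies a single routing value to the whole batch, which only works if every buffered doc belongs to the same website; that is consistent with import_json taking one website_id per call. For reference, a hypothetical sketch of what create_bulk_index_string (also not shown in this diff) would produce, following the standard _bulk format of an action line followed by a source line:

import json

def create_bulk_index_string(docs) -> str:
    # Bulk indexing alternates an action line with the document source.
    lines = []
    for doc in docs:
        lines.append(json.dumps({"index": {}}))
        lines.append(json.dumps(doc))
    return "\n".join(lines) + "\n"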
@@ -293,7 +296,7 @@ class ElasticSearchEngine(SearchEngine):
                 }
             },
             "size": 0
-        }, index=self.index_name, request_timeout=30)
+        }, index=self.index_name, request_timeout=30, routing=website_id)

         stats = dict()
         stats["total_size"] = result["aggregations"]["total_size"]["value"]
@@ -311,11 +314,10 @@ class ElasticSearchEngine(SearchEngine):
"includes": ["path", "name", "ext"]
},
"query": {
"term": {
"website_id": website_id}
"match_all": {}
}
},
index=self.index_name, request_timeout=20)
index=self.index_name, request_timeout=20, routing=website_id)
for hit in hits:
src = hit["_source"]
yield base_url + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
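The yield rebuilds each file's URL from the stored fields. The continuation after the backslash falls outside this hunk, so the extension handling below is an assumption; a small worked example of the visible part:

src = {"path": "photos/2018", "name": "holiday", "ext": "jpg"}
base_url = "https://example.com/"
url = base_url + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
      ("." + src["ext"] if src["ext"] != "" else "")  # extension part assumed
print(url)  # https://example.com/photos/2018/holiday.jpg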
@@ -431,7 +433,7 @@ class ElasticSearchEngine(SearchEngine):
"websites": {
"terms": {
"field": "website_id",
"size": 400 # TODO: Figure out what size is appropriate
"size": 600 # TODO: Figure out what size is appropriate
},
"aggs": {
"size": {
@@ -451,7 +453,8 @@ class ElasticSearchEngine(SearchEngine):
stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
stats["es_search_time_avg"] = stats["es_search_time"] / (stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["es_search_time_avg"] = stats["es_search_time"] / (
stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
stats["total_count"] = total_stats["hits"]["total"]
stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
@@ -479,34 +482,5 @@ class ElasticSearchEngine(SearchEngine):
             }
         }, scroll="1m", client=self.es, index=self.index_name, request_timeout=60)

-    def are_empty(self, websites):
-        result = self.es.search(body={
-            "query": {
-                "bool": {
-                    "filter": {
-                        "terms": {
-                            "website_id": websites
-                        },
-                    }
-                }
-            },
-            "aggs": {
-                "websites": {
-                    "terms": {
-                        "field": "website_id",
-                        "size": 100000,
-                        "min_doc_count": 1
-                    }
-                }
-            },
-            "size": 0
-        }, index=self.index_name, request_timeout=30)
-
-        non_empty_websites = [bucket["key"] for bucket in result["aggregations"]["websites"]["buckets"]]
-
-        for website in websites:
-            if website not in non_empty_websites:
-                yield website
-
     def refresh(self):
         self.es.indices.refresh(self.index_name)
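refresh forces Elasticsearch to make recently indexed documents visible to search immediately, instead of waiting for the index's refresh interval. A hedged usage sketch (client version, connection details, and index name are assumptions):

from elasticsearch import Elasticsearch

es = Elasticsearch()
es.index(index="od-database", doc_type="file", body={"name": "x"}, routing="1")
es.indices.refresh("od-database")
# A search issued now will see the document indexed above.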