Website link list works with elasticsearch

This commit is contained in:
Simon 2018-06-12 21:26:44 -04:00
parent e266a50197
commit bccb1d0dfd
2 changed files with 42 additions and 21 deletions

4
app.py
View File

@ -68,7 +68,6 @@ def website_json_chart(website_id):
website = db.get_website_by_id(website_id) website = db.get_website_by_id(website_id)
print("FIXME: website_json_chart")
if website: if website:
stats = searchEngine.get_stats(website_id) stats = searchEngine.get_stats(website_id)
stats["base_url"] = website.url stats["base_url"] = website.url
@ -85,7 +84,8 @@ def website_links(website_id):
if website: if website:
print("FIXME: website_links") print("FIXME: website_links")
links = [] links = searchEngine.get_link_list(website_id, website.url)
print(links)
return Response("\n".join(links), mimetype="text/plain") return Response("\n".join(links), mimetype="text/plain")
else: else:
abort(404) abort(404)

View File

@ -1,4 +1,5 @@
import elasticsearch import elasticsearch
from elasticsearch import helpers
import os import os
import json import json
@ -54,22 +55,22 @@ class ElasticSearchEngine(SearchEngine):
self.es.indices.close(index=self.index_name) self.es.indices.close(index=self.index_name)
# File names and paths # File names and paths
self.es.indices.put_settings(body= self.es.indices.put_settings(body={
{"analysis": { "analysis": {
"tokenizer": { "tokenizer": {
"my_nGram_tokenizer": { "my_nGram_tokenizer": {
"type": "nGram", "min_gram": 3, "max_gram": 3} "type": "nGram", "min_gram": 3, "max_gram": 3}
}
}}, index=self.index_name)
self.es.indices.put_settings(body=
{"analysis": {
"analyzer": {
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": ["lowercase", "asciifolding"]
} }
} }}, index=self.index_name)
}}, index=self.index_name) self.es.indices.put_settings(body={
"analysis": {
"analyzer": {
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": ["lowercase", "asciifolding"]
}
}
}}, index=self.index_name)
# Mappings # Mappings
self.es.indices.put_mapping(body={"properties": { self.es.indices.put_mapping(body={"properties": {
@ -90,6 +91,10 @@ class ElasticSearchEngine(SearchEngine):
return self.es.ping() return self.es.ping()
def import_json(self, in_str: str, website_id: int): def import_json(self, in_str: str, website_id: int):
if not in_str:
return
import_every = 1000 import_every = 1000
docs = [] docs = []
@ -97,7 +102,7 @@ class ElasticSearchEngine(SearchEngine):
for line in in_str.splitlines(): for line in in_str.splitlines():
doc = json.loads(line) doc = json.loads(line)
name, ext = os.path.splitext(doc["name"]) name, ext = os.path.splitext(doc["name"])
doc["ext"] = ext[1:] if ext and len(ext) > 1 else "" doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
doc["name"] = name doc["name"] = name
doc["website_id"] = website_id doc["website_id"] = website_id
docs.append(doc) docs.append(doc)
@ -149,13 +154,10 @@ class ElasticSearchEngine(SearchEngine):
}, },
"size": per_page, "from": page * per_page}, index=self.index_name) "size": per_page, "from": page * per_page}, index=self.index_name)
# todo get scroll time from config
# todo get size from config
return page return page
def get_stats(self, website_id: int, subdir: str = None): def get_stats(self, website_id: int, subdir: str = None):
stats = {}
result = self.es.search(body={ result = self.es.search(body={
"query": { "query": {
"constant_score": { "constant_score": {
@ -186,9 +188,28 @@ class ElasticSearchEngine(SearchEngine):
"size": 0 "size": 0
}) })
stats = dict()
stats["total_size"] = result["aggregations"]["total_size"]["value"] stats["total_size"] = result["aggregations"]["total_size"]["value"]
stats["total_count"] = result["hits"]["total"] stats["total_count"] = result["hits"]["total"]
stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
for b in result["aggregations"]["ext_group"]["buckets"]] for b in result["aggregations"]["ext_group"]["buckets"]]
return stats return stats
def get_link_list(self, website_id, base_url):
    """Generate the full link of every document indexed for one website.

    This is a generator: documents are pulled through
    ``elasticsearch.helpers.scan`` and one link is yielded per hit.

    :param website_id: value matched against the ``website_id`` field with
        a ``term`` query
    :param base_url: string prepended to every generated link
    :return: yields ``base_url + path + "/" + name + "." + ext``; the "/"
        is left out when ``path`` is empty and the "." is left out when
        ``ext`` is empty
    """
    # Only the three fields required to rebuild a link are requested.
    scan_query = {
        "_source": {
            "includes": ["path", "name", "ext"]
        },
        "query": {
            "term": {
                "website_id": website_id}
        }
    }

    for hit in helpers.scan(client=self.es, query=scan_query, index=self.index_name):
        doc = hit["_source"]
        path = doc["path"]
        name = doc["name"]
        ext = doc["ext"]

        # An empty path gets no separator, an empty extension gets no dot.
        directory = path + "/" if path != "" else ""
        suffix = "." + ext if ext != "" else ""

        yield base_url + directory + name + suffix