Mirror of https://github.com/simon987/od-database.git, synced 2025-04-20 19:06:42 +00:00

Commit bccb1d0dfd: Website link list works with elasticsearch
Parent: e266a50197

app.py (4 changed lines)
@@ -68,7 +68,6 @@ def website_json_chart(website_id):
     website = db.get_website_by_id(website_id)
 
-    print("FIXME: website_json_chart")
     if website:
         stats = searchEngine.get_stats(website_id)
         stats["base_url"] = website.url
@@ -85,7 +84,8 @@ def website_links(website_id):
 
     if website:
         print("FIXME: website_links")
-        links = []
+        links = searchEngine.get_link_list(website_id, website.url)
+        print(links)
         return Response("\n".join(links), mimetype="text/plain")
     else:
         abort(404)
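Note on this hunk: get_link_list (added further down) returns a generator, so "\n".join(links) walks the whole result set before the response is sent. A minimal sketch of a streaming variant, assuming the app, db, and searchEngine module globals that app.py already uses in this diff; the route path in the decorator is a hypothetical illustration, not shown in this commit:

from flask import Response, abort

@app.route("/website/<int:website_id>/links")  # hypothetical route path
def website_links_streaming(website_id):
    website = db.get_website_by_id(website_id)
    if website:
        links = searchEngine.get_link_list(website_id, website.url)
        # Passing a generator to Response streams each link to the client
        # as it is produced instead of materializing the full list first.
        return Response((link + "\n" for link in links), mimetype="text/plain")
    else:
        abort(404)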
Second changed file (name not shown in this view; it defines ElasticSearchEngine):

@@ -1,4 +1,5 @@
 import elasticsearch
+from elasticsearch import helpers
 import os
 import json
 
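The new helpers import brings in the client's streaming utilities: helpers.scan, used by get_link_list below, wraps the Elasticsearch scroll API behind a plain Python generator.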
@@ -54,22 +55,22 @@ class ElasticSearchEngine(SearchEngine):
         self.es.indices.close(index=self.index_name)
 
         # File names and paths
-        self.es.indices.put_settings(body=
-        {"analysis": {
-            "tokenizer": {
-                "my_nGram_tokenizer": {
-                    "type": "nGram", "min_gram": 3, "max_gram": 3}
-            }
-        }}, index=self.index_name)
-        self.es.indices.put_settings(body=
-        {"analysis": {
-            "analyzer": {
-                "my_nGram": {
-                    "tokenizer": "my_nGram_tokenizer",
-                    "filter": ["lowercase", "asciifolding"]
-                }
-            }
-        }}, index=self.index_name)
+        self.es.indices.put_settings(body={
+            "analysis": {
+                "tokenizer": {
+                    "my_nGram_tokenizer": {
+                        "type": "nGram", "min_gram": 3, "max_gram": 3}
+                }
+            }}, index=self.index_name)
+        self.es.indices.put_settings(body={
+            "analysis": {
+                "analyzer": {
+                    "my_nGram": {
+                        "tokenizer": "my_nGram_tokenizer",
+                        "filter": ["lowercase", "asciifolding"]
+                    }
+                }
+            }}, index=self.index_name)
 
         # Mappings
         self.es.indices.put_mapping(body={"properties": {
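For context: analysis settings can only be changed while an index is closed, which is why the class closes the index first (shown above). The two put_settings requests could also be sent as one body, since both sit under the same "analysis" key. A sketch as a standalone function with a hypothetical name, using the same tokenizer and analyzer names as the diff; note that newer Elasticsearch releases spell the tokenizer type "ngram", while "nGram" matches the version this project targeted:

from elasticsearch import Elasticsearch

def apply_filename_analysis(es: Elasticsearch, index_name: str):
    # Equivalent single request: define the tokenizer and the analyzer
    # that uses it in one "analysis" block.
    es.indices.put_settings(body={
        "analysis": {
            "tokenizer": {
                "my_nGram_tokenizer": {"type": "nGram", "min_gram": 3, "max_gram": 3}
            },
            "analyzer": {
                "my_nGram": {
                    "tokenizer": "my_nGram_tokenizer",
                    # lowercase + asciifolding make matches case- and accent-insensitive
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        }
    }, index=index_name)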
@@ -90,6 +91,10 @@ class ElasticSearchEngine(SearchEngine):
         return self.es.ping()
 
     def import_json(self, in_str: str, website_id: int):
+
+        if not in_str:
+            return
+
         import_every = 1000
 
         docs = []
@@ -97,7 +102,7 @@ class ElasticSearchEngine(SearchEngine):
         for line in in_str.splitlines():
             doc = json.loads(line)
             name, ext = os.path.splitext(doc["name"])
-            doc["ext"] = ext[1:] if ext and len(ext) > 1 else ""
+            doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
             doc["name"] = name
             doc["website_id"] = website_id
             docs.append(doc)
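import_json reads newline-delimited JSON, splits each file name into name and (now lowercased) extension, tags every document with its website_id, and buffers import_every = 1000 documents per batch. The flush itself is outside this diff; a self-contained sketch of the whole flow, assuming batches are sent with elasticsearch.helpers.bulk (the repo's actual flush code may differ):

import json
import os
from elasticsearch import Elasticsearch, helpers

def import_json_sketch(es: Elasticsearch, index_name: str, in_str: str, website_id: int):
    if not in_str:
        return
    import_every = 1000
    docs = []
    for line in in_str.splitlines():
        doc = json.loads(line)
        name, ext = os.path.splitext(doc["name"])
        doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
        doc["name"] = name
        doc["website_id"] = website_id
        docs.append(doc)
        if len(docs) >= import_every:
            helpers.bulk(es, docs, index=index_name)  # flush a full batch
            docs.clear()
    if docs:
        helpers.bulk(es, docs, index=index_name)  # flush the remainder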
@@ -149,13 +154,10 @@ class ElasticSearchEngine(SearchEngine):
             },
             "size": per_page, "from": page * per_page}, index=self.index_name)
 
-        # todo get scroll time from config
-        # todo get size from config
         return page
 
     def get_stats(self, website_id: int, subdir: str = None):
 
-        stats = {}
         result = self.es.search(body={
             "query": {
                 "constant_score": {
@@ -186,9 +188,28 @@ class ElasticSearchEngine(SearchEngine):
             "size": 0
         })
 
+        stats = dict()
         stats["total_size"] = result["aggregations"]["total_size"]["value"]
         stats["total_count"] = result["hits"]["total"]
         stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
                               for b in result["aggregations"]["ext_group"]["buckets"]]
 
         return stats
+
+    def get_link_list(self, website_id, base_url):
+
+        hits = helpers.scan(client=self.es,
+                            query={
+                                "_source": {
+                                    "includes": ["path", "name", "ext"]
+                                },
+                                "query": {
+                                    "term": {
+                                        "website_id": website_id}
+                                }
+                            },
+                            index=self.index_name)
+        for hit in hits:
+            src = hit["_source"]
+            yield base_url + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
+                  ("." if src["ext"] != "" else "") + src["ext"]
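get_link_list is a generator: helpers.scan pages through every document matching the website_id term query via the scroll API, fetching only the path, name, and ext fields, and each hit is reassembled into a full URL (base_url + path + "/" + name + "." + ext, with the separators skipped when path or ext is empty). A short usage sketch, assuming an already-constructed engine instance:

# engine = ElasticSearchEngine(...)  # constructor arguments not shown in this diff
# Links are produced lazily, one scroll page at a time, so even very
# large websites never load all hits into memory at once.
for url in engine.get_link_list(website_id=42, base_url="https://example.com/"):
    print(url)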