Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)

Commit: 011b8455a7 "Elasticsearch search engine (search & scroll)"
Parent: 72495275b0
@@ -76,7 +76,7 @@ class HttpDirectory(RemoteDirectory):
 
         results = []
 
-        if len(urls_to_request) > 3:
+        if len(urls_to_request) > 4:
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
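The change above only raises the threshold at which the crawler switches to the thread pool. The starmap/repeat pattern it relies on pairs the same instance with every url; a minimal standalone sketch of that pattern (request_file, ctx, and the url list are stand-ins, not the project's code):

from itertools import repeat
from multiprocessing.pool import ThreadPool

def request_file(ctx, url):
    # Stand-in for HttpDirectory._request_file: fetch metadata for one url.
    return (ctx, url)

ctx = object()  # stand-in for the HttpDirectory instance ("self")
urls_to_request = ["http://example.com/a", "http://example.com/b"]

pool = ThreadPool(processes=10)
# zip(repeat(ctx), urls_to_request) yields (ctx, url) pairs, so starmap
# calls request_file(ctx, url) for each url, up to 10 concurrently.
files = pool.starmap(request_file, zip(repeat(ctx), urls_to_request))
pool.close()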
@@ -11,4 +11,5 @@ apscheduler
 bcrypt
 ftputil
 lxml
 elasticsearch
+python-dateutil
@@ -1,4 +1,5 @@
 import elasticsearch
+from elasticsearch.exceptions import TransportError
 
 
 class IndexingError(Exception):
@@ -13,7 +14,10 @@ class SearchEngine:
     def import_json(self, in_file: str, website_id: int):
         raise NotImplementedError
 
-    def search(self, query) -> list:
+    def search(self, query) -> {}:
+        raise NotImplementedError
+
+    def scroll(self, scroll_id) -> {}:
         raise NotImplementedError
 
     def reset(self):
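This interface change is what lets callers treat results as scrollable pages: search() now returns a response dict rather than a list, and scroll() fetches follow-up pages. A throwaway in-memory stand-in, purely to illustrate the expected return shape (not part of the commit):

class DummySearchEngine(SearchEngine):

    def import_json(self, in_file: str, website_id: int):
        pass

    def search(self, query) -> {}:
        # First page: a scroll id plus a hits envelope, mirroring Elasticsearch.
        return {"_scroll_id": "dummy", "hits": {"total": 0, "hits": []}}

    def scroll(self, scroll_id) -> {}:
        # No more pages (or an invalid id).
        return None

    def reset(self):
        pass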
@@ -40,26 +44,7 @@ class ElasticSearchEngine(SearchEngine):
         self.es.indices.create(index=self.index_name)
         self.es.indices.close(index=self.index_name)
 
-        # Paths
-        self.es.indices.put_settings(body=
-        {"analysis": {
-            "tokenizer": {
-                "path_tokenizer": {
-                    "type": "path_hierarchy"
-                }
-            }
-        }}, index=self.index_name)
-
-        self.es.indices.put_settings(body=
-        {"analysis": {
-            "analyzer": {
-                "path_analyser": {
-                    "tokenizer": "path_tokenizer", "filter": ["lowercase"]
-                }
-            }
-        }}, index=self.index_name)
-
-        # File names
+        # File names and paths
         self.es.indices.put_settings(body=
         {"analysis": {
             "tokenizer": {
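The deleted block configured a dedicated path_hierarchy tokenizer and a lowercase path_analyser for the path field; after this commit, paths are analysed with the same nGram analyzer as file names (see the mapping change below). For reference, this is what the removed tokenizer produced; a sketch against a running Elasticsearch (client construction is assumed):

import elasticsearch

es = elasticsearch.Elasticsearch()
resp = es.indices.analyze(body={"tokenizer": "path_hierarchy", "text": "c/d/e"})
print([t["token"] for t in resp["tokens"]])
# Expected: ['c', 'c/d', 'c/d/e'] -- each prefix of the path is one token.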
@@ -79,7 +64,7 @@ class ElasticSearchEngine(SearchEngine):
 
         # Mappings
         self.es.indices.put_mapping(body={"properties": {
-            "path": {"type": "text", "analyzer": "path_analyser"},
+            "path": {"analyzer": "my_nGram", "type": "text"},
             "name": {"analyzer": "my_nGram", "type": "text"},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
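The my_nGram analyzer both fields now share is defined in a part of the file the diff does not show. As an assumption only, a representative definition consistent with that name and with the substring searches exercised by the tests ("pan", "fla") would look like this:

# Assumption: typical nGram settings; the commit's actual definition lives in
# the unchanged context the diff omits.
ngram_settings = {"analysis": {
    "tokenizer": {
        "my_nGram_tokenizer": {"type": "nGram", "min_gram": 3, "max_gram": 3}
    },
    "analyzer": {
        "my_nGram": {"tokenizer": "my_nGram_tokenizer", "filter": ["lowercase"]}
    }
}}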
@@ -131,5 +116,39 @@ class ElasticSearchEngine(SearchEngine):
             result += action_string + doc[:-1] + website_id_string
         return result
 
-    def search(self, query):
-        pass
+    def search(self, query) -> {}:
+
+        filters = []
+
+        page = self.es.search(body={
+            "query": {
+                "bool": {
+                    "must": {
+                        "multi_match": {
+                            "query": query,
+                            "fields": ["name", "path"],
+                            "operator": "and"
+                        }
+                    },
+                    "filter": filters
+                }
+            },
+            "sort": [
+                "_score"
+            ],
+            "highlight": {
+                "fields": {
+                    "name": {"pre_tags": ["<span class='hl'>"], "post_tags": ["</span>"]},
+                }
+            },
+            "size": 40}, index=self.index_name, scroll="8m")
+
+        # todo get scroll time from config
+        # todo get size from config
+        return page
+
+    def scroll(self, scroll_id) -> {}:
+        try:
+            return self.es.scroll(scroll_id=scroll_id, scroll="3m")  # todo get scroll time from config
+        except TransportError:
+            return None
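Taken together, search() opens a scroll context with an 8 minute keep-alive and returns the first page of 40 hits, and scroll() fetches follow-up pages until Elasticsearch rejects the id, at which point it returns None. A sketch of how a caller can walk every page (the index name is assumed):

engine = ElasticSearchEngine("od-database")  # assumed index name

page = engine.search("flac")
while page is not None and page["hits"]["hits"]:
    for hit in page["hits"]["hits"]:
        print(hit["_source"]["name"])
    page = engine.scroll(page["_scroll_id"])  # empty hits end the loop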
@@ -10,17 +10,18 @@ class SearchTest(TestCase):
     def setUp(self):
         self.search = ElasticSearchEngine("od-database-test")
         self.search.reset()
-        time.sleep(1)
+        time.sleep(0.5)
 
     def test_ping(self):
         self.assertTrue(self.search.ping(), "Search engine not running")
 
-    def test_import_json(self):
+    def test_import_and_search(self):
 
         files = [
-            {"name": "a", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
-            {"name": "b", "size": 123, "path": "", "mtime": None},
-            {"name": "c", "size": -1, "path": "c", "mtime": 12345}
+            {"name": "PaNopTicon", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
+            {"name": "BLAckwAter.Park", "size": 123, "path": "", "mtime": None},
+            {"name": "10'000 days", "size": -1, "path": "c", "mtime": 12345},
+            {"name": "Dead Racer", "size": 1000, "path": "Speed Machine [FLAC]", "mtime": 12345}
         ]
 
         with open("tmp.json", "w") as f:
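import_json consumes this newline-delimited JSON file and, as the `result += action_string + doc[:-1] + website_id_string` line in the earlier hunk suggests, splices a website_id into each document while assembling the Elasticsearch bulk payload. A hedged sketch of that splicing trick (the exact contents of the two strings are assumptions):

import json

action_string = '{"index":{}}\n'             # assumption: a default bulk index action
website_id_string = ',"website_id":123}\n'   # assumption: closes the doc with the id added

doc = json.dumps({"name": "PaNopTicon", "size": 1})
# doc[:-1] drops the closing brace so website_id can be appended inside the object.
bulk_lines = action_string + doc[:-1] + website_id_string
print(bulk_lines)
# {"index":{}}
# {"name": "PaNopTicon", "size": 1,"website_id":123}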
@@ -28,11 +29,59 @@ class SearchTest(TestCase):
                 f.write(json.dumps(file) + "\n")
 
         self.search.import_json("tmp.json", 123)
-        time.sleep(3)
-        self.assertEqual(3, self.search.es.count(self.search.index_name, "file")["count"])
+        time.sleep(2)
+        self.assertEqual(4, self.search.es.count(self.search.index_name, "file")["count"])
 
+        # Search for 'pan' in PaNopTicon and expect 1 result, a scroll id, and a highlight
+        page = self.search.search("pan")
+        self.assertIsNotNone(page["_scroll_id"])
+        self.assertEqual(1, page["hits"]["total"])
+        self.assertIsNotNone(page["hits"]["hits"][0]["highlight"]["name"])
+
+        # Search for 'park' and expect BLAckwAter.Park
+        page = self.search.search("park")
+        self.assertEqual(1, page["hits"]["total"])
+
+        # Search for fla and expect Dead Racer
+        page = self.search.search("fla")
+        self.assertEqual(1, page["hits"]["total"])
+
+        # Search for 10'000 and expect 10'000 days
+        page = self.search.search("10'000")
+        self.assertEqual(1, page["hits"]["total"])
+
         os.remove("tmp.json")
 
+    def test_scroll(self):
+
+        files = [
+            {"name": "PaNopTicon", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
+            {"name": "BLAckwAter.Park", "size": 123, "path": "", "mtime": None},
+            {"name": "10'000 days", "size": -1, "path": "c", "mtime": 12345},
+            {"name": "Dead Racer", "size": 1000, "path": "Speed Machine [FLAC]", "mtime": 12345}
+        ]
+
+        with open("tmp.json", "w") as f:
+            for file in files:
+                f.write(json.dumps(file) + "\n")
+
+        self.search.import_json("tmp.json", 123)
+        time.sleep(2)
+
+        page = self.search.search("")
+        scroll_id = page["_scroll_id"]
+
+        # next page
+        next_page = self.search.scroll(scroll_id)
+        next_scroll_id = next_page["_scroll_id"]
+        self.assertIsNotNone(next_scroll_id)
+
+        # again
+        next_page2 = self.search.scroll(next_scroll_id)
+        self.assertIsNotNone(next_page2["_scroll_id"])
+
+    def test_invalid_scroll(self):
+
+        invalid_scroll = "blahblah"
+
+        self.assertIsNone(self.search.scroll(invalid_scroll))
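test_invalid_scroll passes because the raw client raises TransportError for a malformed scroll id, and ElasticSearchEngine.scroll() converts that into None. The equivalent behaviour against the bare client, as a sketch (client construction assumed):

import elasticsearch
from elasticsearch.exceptions import TransportError

es = elasticsearch.Elasticsearch()
try:
    es.scroll(scroll_id="blahblah", scroll="3m")
except TransportError as e:
    print("rejected scroll id:", e.status_code)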