mirror of https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00

commit 72495275b0 (parent fcfd7d4acc)

    Elasticsearch search engine (import from json)
@@ -49,7 +49,7 @@ class TaskManager:
 
         print("Starting task " + task.url)
 
-        crawler = RemoteDirectoryCrawler(task.url, 10)
+        crawler = RemoteDirectoryCrawler(task.url, 100)
         crawl_result = crawler.crawl_directory("12345.json")
 
         result.file_count = crawl_result.file_count
@@ -63,7 +63,6 @@ class TaskManager:
 
     @staticmethod
    def task_complete(result: TaskResult):
-        print("Task done " + str(result))
         print(result.status_code)
         print(result.file_count)
         print(result.start_time)
@@ -12,7 +12,7 @@ class TooManyConnectionsError(Exception):
 
 class File:
 
-    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
+    def __init__(self, name: str, size: int, mtime: int, path: str, is_dir: bool):
         self.name = name
         self.size = size
         self.mtime = mtime
@@ -69,8 +69,7 @@ def export_to_json(q: Queue, out_file: str) -> int:
         while True:
             try:
                 next_file: File = q.get_nowait()
-                f.write(next_file.to_json())
-                f.write("\n")
+                f.write(next_file.to_json() + "\n")
                 counter += 1
             except Empty:
                 break
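The exported file is therefore newline-delimited JSON, one document per line, which is the format the new search module's import_json consumes. A hypothetical line, assuming File.to_json simply serialises the constructor fields (to_json itself is not shown in this diff), might look like:

    {"name": "a", "size": 123, "mtime": 1528765672, "path": "c/d"}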
@@ -61,7 +61,7 @@ class FtpDirectory(RemoteDirectory):
 
                 results.append(File(
                     name=file_name,
-                    mtime=stat.st_mtime,
+                    mtime=stat.st_mtime,  # TODO: check
                     size=-1 if is_dir else stat.st_size,
                     is_dir=is_dir,
                     path=path
@@ -8,6 +8,7 @@ import requests
 from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
+from dateutil.parser import parse as parse_date
 
 
 class Link:
@@ -59,7 +60,7 @@ class HttpDirectory(RemoteDirectory):
             if self._isdir(link):
                 results.append(File(
                     name=file_name,
-                    mtime="",
+                    mtime=0,
                     size=-1,
                     is_dir=True,
                     path=path
@@ -79,6 +80,7 @@ class HttpDirectory(RemoteDirectory):
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            pool.close()
             for file in files:
                 if file:
                     results.append(file)
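For context, a minimal standalone sketch of the starmap/repeat idiom used above, together with the newly added close() call (the names fetch, ctx and urls are illustrative, not from the codebase):

    from itertools import repeat
    from multiprocessing.pool import ThreadPool


    def fetch(ctx, url):
        # stands in for HttpDirectory._request_file(self, url)
        return ctx + " fetched " + url


    urls = ["a/", "b/", "c/"]
    pool = ThreadPool(processes=10)
    # zip(repeat(ctx), urls) yields (ctx, "a/"), (ctx, "b/"), ... for starmap
    files = pool.starmap(fetch, zip(repeat("crawler"), urls))
    pool.close()  # no further work will be submitted; lets the worker threads shut down
    pool.join()
    print(files)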
@@ -132,12 +134,12 @@ class HttpDirectory(RemoteDirectory):
             stripped_url = url[len(self.base_url) - 1:]
 
             path, name = os.path.split(stripped_url)
+            date = r.headers["Date"] if "Date" in r.headers else "1970-01-01"
             return File(
                 path=unquote(path).strip("/"),
                 name=unquote(name),
                 size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                mtime=r.headers["Date"] if "Date" in r.headers else "?",
+                mtime=int(parse_date(date).timestamp()),
                 is_dir=False
             )
         except RequestException:
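A quick sketch of the new mtime handling (example values only, not from the repository): dateutil parses the RFC 1123 Date header and timestamp() converts it to Unix epoch seconds.

    from dateutil.parser import parse as parse_date

    date = "Tue, 12 Jun 2018 01:07:52 GMT"     # typical HTTP Date header
    mtime = int(parse_date(date).timestamp())  # 1528765672
    print(mtime)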
@@ -3,7 +3,7 @@ import json
 
 
 payload = json.dumps({
-    "url": "http://124.158.108.137/ebooks/",
+    "url": "http://138.197.215.189/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
@@ -11,3 +11,4 @@ apscheduler
 bcrypt
 ftputil
 lxml
+elasticsearch
new file: search/search.py (135 lines)

import elasticsearch


class IndexingError(Exception):
    pass


class SearchEngine:

    def __init__(self):
        pass

    def import_json(self, in_file: str, website_id: int):
        raise NotImplementedError

    def search(self, query) -> list:
        raise NotImplementedError

    def reset(self):
        raise NotImplementedError

    def ping(self):
        raise NotImplementedError


class ElasticSearchEngine(SearchEngine):

    def __init__(self, index_name):
        super().__init__()
        self.index_name = index_name
        self.es = elasticsearch.Elasticsearch()

        if not self.es.indices.exists(self.index_name):
            self.init()

    def init(self):
        print("Elasticsearch first time setup")
        if self.es.indices.exists(self.index_name):
            self.es.indices.delete(index=self.index_name)
        self.es.indices.create(index=self.index_name)
        self.es.indices.close(index=self.index_name)

        # Paths
        self.es.indices.put_settings(body={
            "analysis": {
                "tokenizer": {
                    "path_tokenizer": {
                        "type": "path_hierarchy"
                    }
                }
            }}, index=self.index_name)

        self.es.indices.put_settings(body={
            "analysis": {
                "analyzer": {
                    "path_analyser": {
                        "tokenizer": "path_tokenizer", "filter": ["lowercase"]
                    }
                }
            }}, index=self.index_name)

        # File names
        self.es.indices.put_settings(body={
            "analysis": {
                "tokenizer": {
                    "my_nGram_tokenizer": {
                        "type": "nGram", "min_gram": 3, "max_gram": 3}
                }
            }}, index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {
                "analyzer": {
                    "my_nGram": {
                        "tokenizer": "my_nGram_tokenizer",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }}, index=self.index_name)

        # Mappings
        self.es.indices.put_mapping(body={"properties": {
            "path": {"type": "text", "analyzer": "path_analyser"},
            "name": {"analyzer": "my_nGram", "type": "text"},
            "mtime": {"type": "date", "format": "epoch_millis"},
            "size": {"type": "long"},
            "website_id": {"type": "integer"}
        }}, doc_type="file", index=self.index_name)

        self.es.indices.open(index=self.index_name)

    def reset(self):
        self.init()

    def ping(self):
        return self.es.ping()

    def import_json(self, in_file: str, website_id: int):
        import_every = 1000

        with open(in_file, "r") as f:
            docs = []

            line = f.readline()
            while line:
                docs.append(line[:-1])  # Remove trailing new line

                if len(docs) >= import_every:
                    self._index(docs, website_id)
                    docs.clear()
                line = f.readline()
            self._index(docs, website_id)

    def _index(self, docs, website_id):
        print("Indexing " + str(len(docs)) + " docs")
        bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id)
        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")

        if result["errors"]:
            print(result)
            raise IndexingError

    @staticmethod
    def create_bulk_index_string(docs: list, website_id: int):

        result = ""

        action_string = '{"index":{}}\n'
        website_id_string = ',"website_id":' + str(website_id) + '}\n'  # Add website_id param to each doc

        for doc in docs:
            result += action_string + doc[:-1] + website_id_string
        return result

    def search(self, query):
        pass
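A minimal usage sketch for the new engine (assumes an Elasticsearch instance reachable at the client's default host and port; the index name and file name here are arbitrary):

    from search.search import ElasticSearchEngine

    es = ElasticSearchEngine("od-database")  # creates and configures the index on first use
    if es.ping():
        # "crawl.json" stands for a file produced by export_to_json: one JSON document per line
        es.import_json("crawl.json", website_id=42)

For each input line such as {"name":"a","size":123}, create_bulk_index_string emits the action line {"index":{}} followed by the document with the website_id appended, i.e. {"name":"a","size":123,"website_id":42} — the newline-delimited body the bulk API expects.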
@@ -1,9 +1,7 @@
 from flask_testing import LiveServerTestCase
-import os
 import json
 import requests
 from crawl_server.server import app
-from crawl_server.task_manager import TaskManager
 
 
 class CrawlServerTest(LiveServerTestCase):
@@ -20,15 +18,6 @@ class CrawlServerTest(LiveServerTestCase):
         app.config['LIVESERVER_PORT'] = 9999
         return app
 
-    def test_put_only_accepts_json(self):
-
-        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})
-        r = requests.post(self.HOST + "/task/put", data=payload)
-        self.assertEqual(400, r.status_code)
-
-        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
-        self.assertEqual(200, r2.status_code)
-
     def test_put_task(self):
 
         payload = json.dumps({
@@ -43,11 +32,15 @@ class CrawlServerTest(LiveServerTestCase):
         r = requests.get(self.HOST + "/task")
         self.assertEqual(200, r.status_code)
 
-        print(r.text)
         result = json.loads(r.text)[0]
         self.assertEqual(result["url"], "a")
         self.assertEqual(result["priority"], 2)
         self.assertEqual(result["callback_type"], "c")
         self.assertEqual(result["callback_args"], '{"d": 4}')
 
+        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})
+        r = requests.post(self.HOST + "/task/put", data=payload)
+        self.assertEqual(400, r.status_code)
 
+        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
+        self.assertEqual(200, r2.status_code)
new file: test/test_search.py (38 lines)

from unittest import TestCase
import time
import json
import os
from search.search import ElasticSearchEngine


class SearchTest(TestCase):

    def setUp(self):
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(1)

    def test_ping(self):
        self.assertTrue(self.search.ping(), "Search engine not running")

    def test_import_json(self):

        files = [
            {"name": "a", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
            {"name": "b", "size": 123, "path": "", "mtime": None},
            {"name": "c", "size": -1, "path": "c", "mtime": 12345}
        ]

        with open("tmp.json", "w") as f:
            for file in files:
                f.write(json.dumps(file) + "\n")

        self.search.import_json("tmp.json", 123)
        time.sleep(3)
        self.assertEqual(3, self.search.es.count(self.search.index_name, "file")["count"])

        os.remove("tmp.json")