mirror of https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00

commit 72495275b0 (parent fcfd7d4acc)

    Elasticsearch search engine (import from json)
@@ -49,7 +49,7 @@ class TaskManager:
 
         print("Starting task " + task.url)
 
-        crawler = RemoteDirectoryCrawler(task.url, 10)
+        crawler = RemoteDirectoryCrawler(task.url, 100)
         crawl_result = crawler.crawl_directory("12345.json")
 
         result.file_count = crawl_result.file_count
@@ -63,7 +63,6 @@ class TaskManager:
 
     @staticmethod
    def task_complete(result: TaskResult):
-        print("Task done " + str(result))
         print(result.status_code)
         print(result.file_count)
         print(result.start_time)
@@ -12,7 +12,7 @@ class TooManyConnectionsError(Exception):
 
 class File:
 
-    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
+    def __init__(self, name: str, size: int, mtime: int, path: str, is_dir: bool):
         self.name = name
         self.size = size
         self.mtime = mtime
@@ -69,8 +69,7 @@ def export_to_json(q: Queue, out_file: str) -> int:
         while True:
             try:
                 next_file: File = q.get_nowait()
-                f.write(next_file.to_json())
-                f.write("\n")
+                f.write(next_file.to_json() + "\n")
                 counter += 1
             except Empty:
                 break
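The exported file is therefore newline-delimited JSON, one document per line, which is the format the new search module's import_json consumes. A hypothetical line, assuming File.to_json simply serialises the constructor fields (to_json itself is not shown in this diff), might look like:

    {"name": "a", "size": 123, "mtime": 1528765672, "path": "c/d"}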
@@ -61,7 +61,7 @@ class FtpDirectory(RemoteDirectory):
 
                 results.append(File(
                     name=file_name,
-                    mtime=stat.st_mtime,
+                    mtime=stat.st_mtime,  # TODO: check
                     size=-1 if is_dir else stat.st_size,
                     is_dir=is_dir,
                     path=path
@@ -8,6 +8,7 @@ import requests
 from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
+from dateutil.parser import parse as parse_date
 
 
 class Link:
@@ -59,7 +60,7 @@ class HttpDirectory(RemoteDirectory):
             if self._isdir(link):
                 results.append(File(
                     name=file_name,
-                    mtime="",
+                    mtime=0,
                     size=-1,
                     is_dir=True,
                     path=path
@@ -79,6 +80,7 @@ class HttpDirectory(RemoteDirectory):
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
+            pool.close()
             for file in files:
                 if file:
                     results.append(file)
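For context, a minimal standalone sketch of the starmap/repeat idiom used above, together with the newly added close() call (the names fetch, ctx and urls are illustrative, not from the codebase):

    from itertools import repeat
    from multiprocessing.pool import ThreadPool


    def fetch(ctx, url):
        # stands in for HttpDirectory._request_file(self, url)
        return ctx + " fetched " + url


    urls = ["a/", "b/", "c/"]
    pool = ThreadPool(processes=10)
    # zip(repeat(ctx), urls) yields (ctx, "a/"), (ctx, "b/"), ... for starmap
    files = pool.starmap(fetch, zip(repeat("crawler"), urls))
    pool.close()  # no further work will be submitted; lets the worker threads shut down
    pool.join()
    print(files)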
@@ -132,12 +134,12 @@ class HttpDirectory(RemoteDirectory):
             stripped_url = url[len(self.base_url) - 1:]
 
             path, name = os.path.split(stripped_url)
+            date = r.headers["Date"] if "Date" in r.headers else "1970-01-01"
             return File(
                 path=unquote(path).strip("/"),
                 name=unquote(name),
                 size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                mtime=r.headers["Date"] if "Date" in r.headers else "?",
+                mtime=int(parse_date(date).timestamp()),
                 is_dir=False
             )
         except RequestException:
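A quick sketch of the new mtime handling (example values only, not from the repository): dateutil parses the RFC 1123 Date header and timestamp() converts it to Unix epoch seconds.

    from dateutil.parser import parse as parse_date

    date = "Tue, 12 Jun 2018 01:07:52 GMT"     # typical HTTP Date header
    mtime = int(parse_date(date).timestamp())  # 1528765672
    print(mtime)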
@@ -3,7 +3,7 @@ import json
 
 
 payload = json.dumps({
-    "url": "http://124.158.108.137/ebooks/",
+    "url": "http://138.197.215.189/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"
@@ -11,3 +11,4 @@ apscheduler
 bcrypt
 ftputil
 lxml
+elasticsearch
new file: search/search.py (135 lines)

import elasticsearch


class IndexingError(Exception):
    pass


class SearchEngine:

    def __init__(self):
        pass

    def import_json(self, in_file: str, website_id: int):
        raise NotImplementedError

    def search(self, query) -> list:
        raise NotImplementedError

    def reset(self):
        raise NotImplementedError

    def ping(self):
        raise NotImplementedError


class ElasticSearchEngine(SearchEngine):

    def __init__(self, index_name):
        super().__init__()
        self.index_name = index_name
        self.es = elasticsearch.Elasticsearch()

        if not self.es.indices.exists(self.index_name):
            self.init()

    def init(self):
        print("Elasticsearch first time setup")
        if self.es.indices.exists(self.index_name):
            self.es.indices.delete(index=self.index_name)
        self.es.indices.create(index=self.index_name)
        self.es.indices.close(index=self.index_name)

        # Paths
        self.es.indices.put_settings(body={
            "analysis": {
                "tokenizer": {
                    "path_tokenizer": {
                        "type": "path_hierarchy"
                    }
                }
            }}, index=self.index_name)

        self.es.indices.put_settings(body={
            "analysis": {
                "analyzer": {
                    "path_analyser": {
                        "tokenizer": "path_tokenizer", "filter": ["lowercase"]
                    }
                }
            }}, index=self.index_name)

        # File names
        self.es.indices.put_settings(body={
            "analysis": {
                "tokenizer": {
                    "my_nGram_tokenizer": {
                        "type": "nGram", "min_gram": 3, "max_gram": 3}
                }
            }}, index=self.index_name)
        self.es.indices.put_settings(body={
            "analysis": {
                "analyzer": {
                    "my_nGram": {
                        "tokenizer": "my_nGram_tokenizer",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }}, index=self.index_name)

        # Mappings
        self.es.indices.put_mapping(body={"properties": {
            "path": {"type": "text", "analyzer": "path_analyser"},
            "name": {"analyzer": "my_nGram", "type": "text"},
            "mtime": {"type": "date", "format": "epoch_millis"},
            "size": {"type": "long"},
            "website_id": {"type": "integer"}
        }}, doc_type="file", index=self.index_name)

        self.es.indices.open(index=self.index_name)

    def reset(self):
        self.init()

    def ping(self):
        return self.es.ping()

    def import_json(self, in_file: str, website_id: int):
        import_every = 1000

        with open(in_file, "r") as f:
            docs = []

            line = f.readline()
            while line:
                docs.append(line[:-1])  # Remove trailing new line

                if len(docs) >= import_every:
                    self._index(docs, website_id)
                    docs.clear()
                line = f.readline()
            self._index(docs, website_id)

    def _index(self, docs, website_id):
        print("Indexing " + str(len(docs)) + " docs")
        bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id)
        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")

        if result["errors"]:
            print(result)
            raise IndexingError

    @staticmethod
    def create_bulk_index_string(docs: list, website_id: int):

        result = ""

        action_string = '{"index":{}}\n'
        website_id_string = ',"website_id":' + str(website_id) + '}\n'  # Add website_id param to each doc

        for doc in docs:
            result += action_string + doc[:-1] + website_id_string
        return result

    def search(self, query):
        pass
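A minimal usage sketch for the new engine (assumes an Elasticsearch instance reachable at the client's default host and port; the index name and file name here are arbitrary):

    from search.search import ElasticSearchEngine

    es = ElasticSearchEngine("od-database")  # creates and configures the index on first use
    if es.ping():
        # "crawl.json" stands for a file produced by export_to_json: one JSON document per line
        es.import_json("crawl.json", website_id=42)

For each input line such as {"name":"a","size":123}, create_bulk_index_string emits the action line {"index":{}} followed by the document with the website_id appended, i.e. {"name":"a","size":123,"website_id":42} — the newline-delimited body the bulk API expects.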
@@ -1,9 +1,7 @@
 from flask_testing import LiveServerTestCase
-import os
 import json
 import requests
 from crawl_server.server import app
-from crawl_server.task_manager import TaskManager
 
 
 class CrawlServerTest(LiveServerTestCase):
@@ -20,15 +18,6 @@ class CrawlServerTest(LiveServerTestCase):
         app.config['LIVESERVER_PORT'] = 9999
         return app
 
-    def test_put_only_accepts_json(self):
-
-        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})
-        r = requests.post(self.HOST + "/task/put", data=payload)
-        self.assertEqual(400, r.status_code)
-
-        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
-        self.assertEqual(200, r2.status_code)
-
     def test_put_task(self):
 
         payload = json.dumps({
@@ -43,11 +32,15 @@ class CrawlServerTest(LiveServerTestCase):
         r = requests.get(self.HOST + "/task")
         self.assertEqual(200, r.status_code)
 
-        print(r.text)
         result = json.loads(r.text)[0]
         self.assertEqual(result["url"], "a")
         self.assertEqual(result["priority"], 2)
         self.assertEqual(result["callback_type"], "c")
         self.assertEqual(result["callback_args"], '{"d": 4}')
 
+        payload = json.dumps({"url": "", "priority": 1, "callback_type": "", "callback_args": "{}"})
+        r = requests.post(self.HOST + "/task/put", data=payload)
+        self.assertEqual(400, r.status_code)
 
+        r2 = requests.post(self.HOST + "/task/put", headers=self.headers, data=payload)
+        self.assertEqual(200, r2.status_code)
new file: test/test_search.py (38 lines)

from unittest import TestCase
import time
import json
import os
from search.search import ElasticSearchEngine


class SearchTest(TestCase):

    def setUp(self):
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(1)

    def test_ping(self):
        self.assertTrue(self.search.ping(), "Search engine not running")

    def test_import_json(self):

        files = [
            {"name": "a", "size": 1000000000000000000, "path": "c/d", "mtime": 1528765672},
            {"name": "b", "size": 123, "path": "", "mtime": None},
            {"name": "c", "size": -1, "path": "c", "mtime": 12345}
        ]

        with open("tmp.json", "w") as f:
            for file in files:
                f.write(json.dumps(file) + "\n")

        self.search.import_json("tmp.json", 123)
        time.sleep(3)
        self.assertEqual(3, self.search.es.count(self.search.index_name, "file")["count"])

        os.remove("tmp.json")