From 9bde8cb629d57fb02c4a580f9d903334af326e04 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 13 Jun 2018 14:11:27 -0400 Subject: [PATCH] uWSGI config and bugfix with file extensions --- app.py | 6 ++--- database.py | 7 ++++-- od-database.ini | 8 +++++++ search/search.py | 10 ++++---- startWSGI.sh | 2 ++ stress_test.py | 56 +++++++++++++++++++++++++++++++++++++++++++ task.py | 21 ++++++++-------- templates/home.html | 17 +------------ templates/search.html | 4 ++-- uwsgi.py | 11 +++++++++ 10 files changed, 104 insertions(+), 38 deletions(-) create mode 100644 od-database.ini create mode 100644 startWSGI.sh create mode 100644 stress_test.py create mode 100644 uwsgi.py diff --git a/app.py b/app.py index 2ce196a..7a8bb53 100644 --- a/app.py +++ b/app.py @@ -113,7 +113,7 @@ def search(): if len(q) >= 3: try: hits = searchEngine.search(q, page, per_page, sort_order) - hits = db.join_search_result(hits) + hits = db.join_website_on_search_result(hits) except InvalidQueryException as e: flash("Invalid query: " + str(e), "warning") return redirect("/search") @@ -299,6 +299,6 @@ if __name__ == '__main__': if config.USE_SSL: context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') - app.run("0.0.0.0", port=12345, ssl_context=context) + app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) else: - app.run("0.0.0.0", port=12345) + app.run("0.0.0.0", port=12345, threaded=True) diff --git a/database.py b/database.py index 176ab05..a388bbe 100644 --- a/database.py +++ b/database.py @@ -206,12 +206,15 @@ class Database: result[db_website[0]] = db_website[1] return result - def join_search_result(self, page: dict) -> dict: + def join_website_on_search_result(self, page: dict) -> dict: websites = self.get_all_websites() for hit in page["hits"]["hits"]: - hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]] + if hit["_source"]["website_id"] in websites: + hit["_source"]["website_url"] = 
websites[hit["_source"]["website_id"]] + else: + hit["_source"]["website_url"] = "NONE" return page diff --git a/od-database.ini b/od-database.ini new file mode 100644 index 0000000..282d72e --- /dev/null +++ b/od-database.ini @@ -0,0 +1,8 @@ +[uwsgi] +socket = 127.0.0.1:3031 +chdir = /home/simon/Dropbox/data/CS/python/od-database/ +wsgi-file = uwsgi.py +processes = 4 +threads = 4 +stats = 127.0.0.1:9191 +callable=app \ No newline at end of file diff --git a/search/search.py b/search/search.py index 60af95a..31b84e5 100644 --- a/search/search.py +++ b/search/search.py @@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine): # Mappings self.es.indices.put_mapping(body={"properties": { - "path": {"analyzer": "my_nGram", "type": "text"}, - "name": {"analyzer": "my_nGram", "type": "text"}, + "path": {"analyzer": "standard", "type": "text"}, + "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, "mtime": {"type": "date", "format": "epoch_millis"}, "size": {"type": "long"}, "website_id": {"type": "integer"}, @@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine): if not in_str: return - import_every = 1000 + import_every = 5000 docs = [] @@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine): "must": { "multi_match": { "query": query, - "fields": ["name", "path"], - "operator": "and" + "fields": ["name^5", "name.nGram^2", "path"], + "operator": "or" } }, "filter": filters diff --git a/startWSGI.sh b/startWSGI.sh new file mode 100644 index 0000000..a7ad37d --- /dev/null +++ b/startWSGI.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +uwsgi od-database.ini \ No newline at end of file diff --git a/stress_test.py b/stress_test.py new file mode 100644 index 0000000..bc57f07 --- /dev/null +++ b/stress_test.py @@ -0,0 +1,56 @@ +import os +import json +import sys +from search.search import ElasticSearchEngine +from concurrent.futures import ThreadPoolExecutor +import requests +import random + + +def 
dump_local_filesystem(root_dir: str): + + docs = [] + + for root, dirs, files in os.walk(root_dir): + + for filename in files: + full_path = os.path.join(root, filename) + stats = os.stat(full_path) + + doc = dict() + doc["name"] = filename + doc["path"] = root + doc["mtime"] = stats.st_mtime + doc["size"] = stats.st_size + + docs.append(doc) + + with open("local_filesystem.json", "w") as f: + f.writelines(json.dumps(doc) + "\n" for doc in docs) + + +def index_file_list(path: str, website_id): + + es = ElasticSearchEngine("od-database") + with open(path, "r") as f: + es.import_json(f.read(), website_id) + + +def search(term=""): + requests.get("http://localhost/?&sort_order=score&per_page=100&q=" + term, verify=False) + print(term) + + +def random_searches(count=10000000, max_workers=1000): + + terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\ + .text.splitlines() + + pool = ThreadPoolExecutor(max_workers=max_workers) + pool.map(search, random.choices(terms, k=count)) + + + +# dump_local_filesystem("/mnt/") +# index_file_list("local_filesystem.json", 10) +# random_searches(100000) diff --git a/task.py b/task.py index 2c22288..abfd2c5 100644 --- a/task.py +++ b/task.py @@ -28,7 +28,7 @@ class CrawlServer: except ConnectionError: return False - def get_completed_tasks(self) -> list: + def fetch_completed_tasks(self) -> list: try: r = requests.get(self.url + "/task/completed") @@ -36,9 +36,10 @@ class CrawlServer: TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"]) for r in json.loads(r.text)] except ConnectionError: + print("Crawl server cannot be reached " + self.url) return [] - def get_queued_tasks(self) -> list: + def fetch_queued_tasks(self) -> list: try: r = requests.get(self.url + "/task/") @@ -49,7 +50,7 @@ class CrawlServer: except ConnectionError: return [] - def get_current_tasks(self): + def fetch_current_tasks(self): try: r = requests.get(self.url + 
"/task/current") @@ -58,14 +59,13 @@ class CrawlServer: for t in json.loads(r.text) ] except ConnectionError: - print("Server cannot be reached " + self.url) return [] - def get_file_list(self, website_id) -> str: + def fetch_website_files(self, website_id) -> str: try: r = requests.get(self.url + "/file_list/" + str(website_id) + "/") - return r.text + return r.text if r.status_code == 200 else "" except ConnectionError: return "" @@ -73,6 +73,7 @@ class CrawlServer: class TaskDispatcher: def __init__(self): + # TODO: remove reddit reddit = praw.Reddit('opendirectories-bot', user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') self.reddit_bot = RedditBot("crawled.txt", reddit) @@ -91,9 +92,9 @@ class TaskDispatcher: def check_completed_tasks(self): for server in self.crawl_servers: - for task in server.get_completed_tasks(): + for task in server.fetch_completed_tasks(): print("Completed task") - file_list = server.get_file_list(task.website_id) + file_list = server.fetch_website_files(task.website_id) self.search.import_json(file_list, task.website_id) def dispatch_task(self, task: Task): @@ -108,7 +109,7 @@ class TaskDispatcher: queued_tasks = [] for server in self.crawl_servers: - queued_tasks.extend(server.get_queued_tasks()) + queued_tasks.extend(server.fetch_queued_tasks()) return queued_tasks @@ -117,7 +118,7 @@ class TaskDispatcher: current_tasks = [] for server in self.crawl_servers: - current_tasks.extend(server.get_current_tasks()) + current_tasks.extend(server.fetch_current_tasks()) return current_tasks diff --git a/templates/home.html b/templates/home.html index 9a2e373..9c9d084 100644 --- a/templates/home.html +++ b/templates/home.html @@ -26,7 +26,7 @@
- +
@@ -34,20 +34,5 @@ -
-
Full-text Query Syntax
-
- -

The following query types are allowed (More information - here):

-

Exact term: "foo"

-

Term with prefix: "foo*"

-

File names: "name:foo"

-

Paths: "path:foo"

-

Starts with term: "^foo"

-

NEAR group: "NEAR(foo bar, 3)"

-
-
-
 {% endblock body %} diff --git a/templates/search.html b/templates/search.html index 8bc7485..6aa216f 100644 --- a/templates/search.html +++ b/templates/search.html @@ -58,8 +58,8 @@ {# File name & link #} - - {{ hl_name |safe }}{{ src["ext"] }} + + {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }} {# File type badge #} {% set mime = get_mime(src["path"]) %} diff --git a/uwsgi.py b/uwsgi.py new file mode 100644 index 0000000..433d3be --- /dev/null +++ b/uwsgi.py @@ -0,0 +1,11 @@ +from app import app +import config +import ssl + +if __name__ == '__main__': + if config.USE_SSL: + context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') + app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) + else: + app.run("0.0.0.0", port=12345, threaded=True)