Mirror of https://github.com/simon987/od-database.git (synced 2025-04-20 10:56:47 +00:00)

commit 9bde8cb629 (parent e91572a06f)

    uWSGI config and bugfix with file extensions
app.py (3 additions, 3 deletions)

@@ -113,7 +113,7 @@ def search():
     if len(q) >= 3:
         try:
             hits = searchEngine.search(q, page, per_page, sort_order)
-            hits = db.join_search_result(hits)
+            hits = db.join_website_on_search_result(hits)
         except InvalidQueryException as e:
             flash("<strong>Invalid query:</strong> " + str(e), "warning")
             return redirect("/search")

@@ -299,6 +299,6 @@ if __name__ == '__main__':
     if config.USE_SSL:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
-        app.run("0.0.0.0", port=12345, ssl_context=context)
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
     else:
-        app.run("0.0.0.0", port=12345)
+        app.run("0.0.0.0", port=12345, threaded=True)
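Both app.run calls now pass threaded=True, so Werkzeug's development server handles each request in its own thread instead of serializing them. A minimal sketch of the equivalent explicit Werkzeug call (not part of the commit; the uWSGI setup below replaces this server in production anyway):

    # sketch: the same threaded development server invoked through Werkzeug directly
    from werkzeug.serving import run_simple
    from app import app

    run_simple("0.0.0.0", 12345, app, threaded=True)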
database.py (5 additions, 2 deletions)

@@ -206,12 +206,15 @@ class Database:
             result[db_website[0]] = db_website[1]
         return result

-    def join_search_result(self, page: dict) -> dict:
+    def join_website_on_search_result(self, page: dict) -> dict:

         websites = self.get_all_websites()

         for hit in page["hits"]["hits"]:
-            hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            if hit["_source"]["website_id"] in websites:
+                hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            else:
+                hit["_source"]["website_url"] = "NONE"

         return page
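join_website_on_search_result (renamed from join_search_result) now tolerates hits whose website_id has no matching website row, instead of raising KeyError. The same guard can be written with dict.get; a sketch, not the committed code:

    for hit in page["hits"]["hits"]:
        # .get() substitutes the fallback instead of raising KeyError on unknown ids
        hit["_source"]["website_url"] = websites.get(hit["_source"]["website_id"], "NONE")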
od-database.ini (new file, 8 lines)

@@ -0,0 +1,8 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = /home/simon/Dropbox/data/CS/python/od-database/
+wsgi-file = uwsgi.py
+processes = 4
+threads = 4
+stats = 127.0.0.1:9191
+callable=app
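The stats option exposes a JSON snapshot of worker state on 127.0.0.1:9191, the same socket uwsgitop reads. A hedged sketch of polling it by hand, assuming uWSGI's documented push-on-connect stats format:

    import json
    import socket

    # uWSGI's stats server writes one JSON document to each connecting client
    with socket.create_connection(("127.0.0.1", 9191)) as s:
        data = b""
        while chunk := s.recv(4096):
            data += chunk

    stats = json.loads(data)
    for worker in stats["workers"]:
        print(worker["id"], worker["status"], worker["requests"])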
search/search.py (5 additions, 5 deletions)

@@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine):

         # Mappings
         self.es.indices.put_mapping(body={"properties": {
-            "path": {"analyzer": "my_nGram", "type": "text"},
-            "name": {"analyzer": "my_nGram", "type": "text"},
+            "path": {"analyzer": "standard", "type": "text"},
+            "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
             "website_id": {"type": "integer"},
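With the multi-field mapping, name is indexed twice: word-by-word under the standard analyzer and as character n-grams under name.nGram, so a query can hit either representation. One way to compare what the two analyzers emit, assuming my_nGram is declared in the index settings (a sketch):

    from elasticsearch import Elasticsearch

    es = Elasticsearch()

    # show the tokens each analyzer produces for the same file name
    for analyzer in ("standard", "my_nGram"):
        res = es.indices.analyze(index="od-database", body={"analyzer": analyzer, "text": "ubuntu-18.04.iso"})
        print(analyzer, [t["token"] for t in res["tokens"]])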
@@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine):
     if not in_str:
         return

-    import_every = 1000
+    import_every = 5000

     docs = []
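import_every is the flush threshold for bulk indexing: documents are buffered and sent one batch per request, so 5000 trades a little memory for fewer round-trips. The buffering pattern it implies looks roughly like this (a sketch using elasticsearch.helpers.bulk, not the committed implementation):

    import json
    from elasticsearch import helpers

    def bulk_import(es, index, in_str, website_id, import_every=5000):
        docs = []
        for line in in_str.splitlines():
            doc = json.loads(line)
            doc["website_id"] = website_id
            docs.append({"_index": index, "_source": doc})
            if len(docs) >= import_every:
                helpers.bulk(es, docs)  # flush a full batch
                docs.clear()
        if docs:
            helpers.bulk(es, docs)  # flush the remainder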
@@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine):
                 "must": {
                     "multi_match": {
                         "query": query,
-                        "fields": ["name", "path"],
-                        "operator": "and"
+                        "fields": ["name^5", "name.nGram^2", "path"],
+                        "operator": "or"
                     }
                 },
                 "filter": filters
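name^5 and name.nGram^2 are per-field boosts: a whole-word match on the file name weighs five times a path match, with n-gram (substring) matches in between, and switching the operator to or returns documents matching any query term rather than all of them. Issued standalone, the query body looks like this (sketch with a made-up search term, reusing the es client from above):

    body = {
        "query": {
            "multi_match": {
                "query": "ubuntu iso",
                "fields": ["name^5", "name.nGram^2", "path"],  # boosted fields
                "operator": "or"  # any term may match
            }
        }
    }
    hits = es.search(index="od-database", body=body)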
startWSGI.sh (new file, 2 lines)

@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+uwsgi od-database.ini
stress_test.py (new file, 56 lines)

@@ -0,0 +1,56 @@
+import os
+import json
+import sys
+from search.search import ElasticSearchEngine
+from concurrent.futures import ThreadPoolExecutor
+import requests
+import random
+
+
+def dump_local_filesystem(root_dir: str):
+
+    docs = []
+
+    for root, dirs, files in os.walk(root_dir):
+
+        for filename in files:
+            full_path = os.path.join(root, filename)
+            stats = os.stat(full_path)
+
+            doc = dict()
+            doc["name"] = filename
+            doc["path"] = root
+            doc["mtime"] = stats.st_mtime
+            doc["size"] = stats.st_size
+
+            docs.append(doc)
+
+    with open("local_filesystem.json", "w") as f:
+        f.writelines(json.dumps(doc) + "\n" for doc in docs)
+
+
+def index_file_list(path: str, website_id):
+
+    es = ElasticSearchEngine("od-database")
+    with open(path, "r") as f:
+        es.import_json(f.read(), website_id)
+
+
+def search(term=""):
+    requests.get("http://localhost/?sort_order=score&per_page=100&q=" + term, verify=False)
+    print(term)
+
+
+def random_searches(count=10000000, max_workers=1000):
+
+    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
+        .text.splitlines()
+
+    pool = ThreadPoolExecutor(max_workers=max_workers)
+    pool.map(search, random.choices(terms, k=count))
+
+
+# dump_local_filesystem("/mnt/")
+# index_file_list("local_filesystem.json", 10)
+# random_searches(100000)
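random_searches pushes count requests through a pool of up to max_workers threads. Running the executor as a context manager makes the shutdown explicit, blocking until every queued search has completed (a sketch, not the committed code):

    # equivalent body for random_searches with explicit pool shutdown
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(search, random.choices(terms, k=count))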
task.py (11 additions, 10 deletions)

@@ -28,7 +28,7 @@ class CrawlServer:
         except ConnectionError:
             return False

-    def get_completed_tasks(self) -> list:
+    def fetch_completed_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/completed")

@@ -36,9 +36,10 @@ class CrawlServer:
                 TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"])
                 for r in json.loads(r.text)]
         except ConnectionError:
+            print("Crawl server cannot be reached " + self.url)
             return []

-    def get_queued_tasks(self) -> list:
+    def fetch_queued_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/")

@@ -49,7 +50,7 @@ class CrawlServer:
         except ConnectionError:
             return []

-    def get_current_tasks(self):
+    def fetch_current_tasks(self):

         try:
             r = requests.get(self.url + "/task/current")

@@ -58,14 +59,13 @@ class CrawlServer:
                 for t in json.loads(r.text)
             ]
         except ConnectionError:
-            print("Server cannot be reached " + self.url)
             return []

-    def get_file_list(self, website_id) -> str:
+    def fetch_website_files(self, website_id) -> str:

         try:
             r = requests.get(self.url + "/file_list/" + str(website_id) + "/")
-            return r.text
+            return r.text if r.status_code == 200 else ""
         except ConnectionError:
             return ""

@@ -73,6 +73,7 @@ class CrawlServer:
 class TaskDispatcher:

     def __init__(self):
+        # TODO: remove reddit
         reddit = praw.Reddit('opendirectories-bot',
                              user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
         self.reddit_bot = RedditBot("crawled.txt", reddit)

@@ -91,9 +92,9 @@ class TaskDispatcher:
     def check_completed_tasks(self):

         for server in self.crawl_servers:
-            for task in server.get_completed_tasks():
+            for task in server.fetch_completed_tasks():
                 print("Completed task")
-                file_list = server.get_file_list(task.website_id)
+                file_list = server.fetch_website_files(task.website_id)
                 self.search.import_json(file_list, task.website_id)

     def dispatch_task(self, task: Task):

@@ -108,7 +109,7 @@ class TaskDispatcher:
         queued_tasks = []

         for server in self.crawl_servers:
-            queued_tasks.extend(server.get_queued_tasks())
+            queued_tasks.extend(server.fetch_queued_tasks())

         return queued_tasks

@@ -117,7 +118,7 @@ class TaskDispatcher:

         current_tasks = []
         for server in self.crawl_servers:
-            current_tasks.extend(server.get_current_tasks())
+            current_tasks.extend(server.fetch_current_tasks())

         return current_tasks
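The get_* accessors become fetch_* to make the remote round-trip explicit, fetch_completed_tasks now logs which crawl server was unreachable, and fetch_website_files only trusts a 200 response. A hedged usage sketch, assuming CrawlServer is constructed from the server's base URL:

    server = CrawlServer("http://localhost:5001")  # hypothetical crawl server address

    for task in server.fetch_completed_tasks():
        file_list = server.fetch_website_files(task.website_id)
        if file_list:  # empty string signals a connection error or non-200 response
            print("importing file list for website", task.website_id)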
@@ -26,7 +26,7 @@
         <form action="/search">

             <div class="form-group">
-                <input class="form-control" name="q" id="q" placeholder="Full-text Query">
+                <input class="form-control" name="q" id="q" placeholder="Query">
             </div>

             <input class="btn btn-primary btn-shadow" type="submit" value="Search">

@@ -34,20 +34,5 @@
             </div>
         </div>
-
-        <div class="card">
-            <div class="card-header">Full-text Query Syntax</div>
-            <div class="card-body">
-
-                <p>The following query types are allowed (More information
-                    <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">here</a>):</p>
-                <p>Exact term: <code> "foo"</code></p>
-                <p>Term with prefix: <code> "foo*"</code></p>
-                <p>File names: <code> "name:foo"</code></p>
-                <p>Paths: <code> "path:foo"</code></p>
-                <p>Starts with term: <code> "^foo"</code></p>
-                <p>NEAR group: <code> "NEAR(foo bar, 3)"</code></p>
-            </div>
-        </div>

     </div>
 {% endblock body %}
@@ -58,8 +58,8 @@
             <tr>
                 <td>
                     {# File name & link #}
-                    <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
-                        {{ hl_name |safe }}{{ src["ext"] }}
+                    <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
+                        {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}
                     </a>
                     {# File type badge #}
                     {% set mime = get_mime(src["path"]) %}
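This is the file-extension bugfix from the commit title: links and labels previously concatenated name and ext with no separator, producing readmemd instead of readme.md, and the dot is now inserted only when an extension exists, so extensionless files stay intact. The logic, restated as a plain Python sketch:

    def display_name(name: str, ext: str) -> str:
        # insert the dot only when there is an extension to append
        return name + ("." if ext != "" else "") + ext

    assert display_name("readme", "md") == "readme.md"
    assert display_name("Makefile", "") == "Makefile"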
uwsgi.py (new file, 11 lines)

@@ -0,0 +1,11 @@
+from app import app
+import config
+import ssl
+
+if __name__ == '__main__':
+    if config.USE_SSL:
+        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
+    else:
+        app.run("0.0.0.0", port=12345, threaded=True)