From 9bde8cb629d57fb02c4a580f9d903334af326e04 Mon Sep 17 00:00:00 2001 From: Simon Date: Wed, 13 Jun 2018 14:11:27 -0400 Subject: [PATCH] uWSGI config and bugfix with file extensions --- app.py | 6 ++--- database.py | 7 ++++-- od-database.ini | 8 +++++++ search/search.py | 10 ++++---- startWSGI.sh | 2 ++ stress_test.py | 56 +++++++++++++++++++++++++++++++++++++++++++ task.py | 21 ++++++++-------- templates/home.html | 17 +------------ templates/search.html | 4 ++-- uwsgi.py | 11 +++++++++ 10 files changed, 104 insertions(+), 38 deletions(-) create mode 100644 od-database.ini create mode 100644 startWSGI.sh create mode 100644 stress_test.py create mode 100644 uwsgi.py diff --git a/app.py b/app.py index 2ce196a..7a8bb53 100644 --- a/app.py +++ b/app.py @@ -113,7 +113,7 @@ def search(): if len(q) >= 3: try: hits = searchEngine.search(q, page, per_page, sort_order) - hits = db.join_search_result(hits) + hits = db.join_website_on_search_result(hits) except InvalidQueryException as e: flash("Invalid query: " + str(e), "warning") return redirect("/search") @@ -299,6 +299,6 @@ if __name__ == '__main__': if config.USE_SSL: context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') - app.run("0.0.0.0", port=12345, ssl_context=context) + app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) else: - app.run("0.0.0.0", port=12345) + app.run("0.0.0.0", port=12345, threaded=True) diff --git a/database.py b/database.py index 176ab05..a388bbe 100644 --- a/database.py +++ b/database.py @@ -206,12 +206,15 @@ class Database: result[db_website[0]] = db_website[1] return result - def join_search_result(self, page: dict) -> dict: + def join_website_on_search_result(self, page: dict) -> dict: websites = self.get_all_websites() for hit in page["hits"]["hits"]: - hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]] + if hit["_source"]["website_id"] in websites: + hit["_source"]["website_url"] = 
websites[hit["_source"]["website_id"]] + else: + hit["_source"]["website_url"] = "NONE" return page diff --git a/od-database.ini b/od-database.ini new file mode 100644 index 0000000..282d72e --- /dev/null +++ b/od-database.ini @@ -0,0 +1,8 @@ +[uwsgi] +socket = 127.0.0.1:3031 +chdir = /home/simon/Dropbox/data/CS/python/od-database/ +wsgi-file = uwsgi.py +processes = 4 +threads = 4 +stats = 127.0.0.1:9191 +callable=app \ No newline at end of file diff --git a/search/search.py b/search/search.py index 60af95a..31b84e5 100644 --- a/search/search.py +++ b/search/search.py @@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine): # Mappings self.es.indices.put_mapping(body={"properties": { - "path": {"analyzer": "my_nGram", "type": "text"}, - "name": {"analyzer": "my_nGram", "type": "text"}, + "path": {"analyzer": "standard", "type": "text"}, + "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, "mtime": {"type": "date", "format": "epoch_millis"}, "size": {"type": "long"}, "website_id": {"type": "integer"}, @@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine): if not in_str: return - import_every = 1000 + import_every = 5000 docs = [] @@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine): "must": { "multi_match": { "query": query, - "fields": ["name", "path"], - "operator": "and" + "fields": ["name^5", "name.nGram^2", "path"], + "operator": "or" } }, "filter": filters diff --git a/startWSGI.sh b/startWSGI.sh new file mode 100644 index 0000000..a7ad37d --- /dev/null +++ b/startWSGI.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +uwsgi od-database.ini \ No newline at end of file diff --git a/stress_test.py b/stress_test.py new file mode 100644 index 0000000..bc57f07 --- /dev/null +++ b/stress_test.py @@ -0,0 +1,56 @@ +import os +import json +import sys +from search.search import ElasticSearchEngine +from concurrent.futures import ThreadPoolExecutor +import requests +import random + + +def 
dump_local_filesystem(root_dir: str): + + docs = [] + + for root, dirs, files in os.walk(root_dir): + + for filename in files: + full_path = os.path.join(root, filename) + stats = os.stat(full_path) + + doc = dict() + doc["name"] = filename + doc["path"] = root + doc["mtime"] = stats.st_mtime + doc["size"] = stats.st_size + + docs.append(doc) + + with open("local_filesystem.json", "w") as f: + f.writelines(json.dumps(doc) + "\n" for doc in docs) + + +def index_file_list(path: str, website_id): + + es = ElasticSearchEngine("od-database") + with open(path, "r") as f: + es.import_json(f.read(), website_id) + + +def search(term=""): + requests.get("http://localhost/?&sort_order=score&per_page=100&q=" + term, verify=False) + print(term) + + +def random_searches(count=10000000, max_workers=1000): + + terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\ + .text.splitlines() + + pool = ThreadPoolExecutor(max_workers=max_workers) + pool.map(search, random.choices(terms, k=count)) + + + +# dump_local_filesystem("/mnt/") +# index_file_list("local_filesystem.json", 10) +# random_searches(100000) diff --git a/task.py b/task.py index 2c22288..abfd2c5 100644 --- a/task.py +++ b/task.py @@ -28,7 +28,7 @@ class CrawlServer: except ConnectionError: return False - def get_completed_tasks(self) -> list: + def fetch_completed_tasks(self) -> list: try: r = requests.get(self.url + "/task/completed") @@ -36,9 +36,10 @@ class CrawlServer: TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"]) for r in json.loads(r.text)] except ConnectionError: + print("Crawl server cannot be reached " + self.url) return [] - def get_queued_tasks(self) -> list: + def fetch_queued_tasks(self) -> list: try: r = requests.get(self.url + "/task/") @@ -49,7 +50,7 @@ class CrawlServer: except ConnectionError: return [] - def get_current_tasks(self): + def fetch_current_tasks(self): try: r = requests.get(self.url + 
"/task/current") @@ -58,14 +59,13 @@ class CrawlServer: for t in json.loads(r.text) ] except ConnectionError: - print("Server cannot be reached " + self.url) return [] - def get_file_list(self, website_id) -> str: + def fetch_website_files(self, website_id) -> str: try: r = requests.get(self.url + "/file_list/" + str(website_id) + "/") - return r.text + return r.text if r.status_code == 200 else "" except ConnectionError: return "" @@ -73,6 +73,7 @@ class CrawlServer: class TaskDispatcher: def __init__(self): + # TODO: remove reddit reddit = praw.Reddit('opendirectories-bot', user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') self.reddit_bot = RedditBot("crawled.txt", reddit) @@ -91,9 +92,9 @@ class TaskDispatcher: def check_completed_tasks(self): for server in self.crawl_servers: - for task in server.get_completed_tasks(): + for task in server.fetch_completed_tasks(): print("Completed task") - file_list = server.get_file_list(task.website_id) + file_list = server.fetch_website_files(task.website_id) self.search.import_json(file_list, task.website_id) def dispatch_task(self, task: Task): @@ -108,7 +109,7 @@ class TaskDispatcher: queued_tasks = [] for server in self.crawl_servers: - queued_tasks.extend(server.get_queued_tasks()) + queued_tasks.extend(server.fetch_queued_tasks()) return queued_tasks @@ -117,7 +118,7 @@ class TaskDispatcher: current_tasks = [] for server in self.crawl_servers: - current_tasks.extend(server.get_current_tasks()) + current_tasks.extend(server.fetch_current_tasks()) return current_tasks diff --git a/templates/home.html b/templates/home.html index 9a2e373..9c9d084 100644 --- a/templates/home.html +++ b/templates/home.html @@ -26,7 +26,7 @@
- +
@@ -34,20 +34,5 @@ -
-
Full-text Query Syntax
-
- -

The following query types are allowed (More information - here):

-

Exact term: "foo"

-

Term with prefix: "foo*"

-

File names: "name:foo"

-

Paths: "path:foo"

-

Starts with term: "^foo"

-

NEAR group: "NEAR(foo bar, 3)"

-
-
-
 {% endblock body %} diff --git a/templates/search.html b/templates/search.html index 8bc7485..6aa216f 100644 --- a/templates/search.html +++ b/templates/search.html @@ -58,8 +58,8 @@ {# File name & link #} - - {{ hl_name |safe }}{{ src["ext"] }} + + {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }} {# File type badge #} {% set mime = get_mime(src["path"]) %} diff --git a/uwsgi.py b/uwsgi.py new file mode 100644 index 0000000..433d3be --- /dev/null +++ b/uwsgi.py @@ -0,0 +1,11 @@ +from app import app +import config +import ssl + +if __name__ == '__main__': + if config.USE_SSL: + context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem') + app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True) + else: + app.run("0.0.0.0", port=12345, threaded=True)