uWSGI config and bugfix with file extensions

commit 9bde8cb629 (parent e91572a06f)
Simon, 2018-06-13 14:11:27 -04:00
10 changed files with 104 additions and 38 deletions

app.py (6 changes)

@@ -113,7 +113,7 @@ def search():
     if len(q) >= 3:
         try:
             hits = searchEngine.search(q, page, per_page, sort_order)
-            hits = db.join_search_result(hits)
+            hits = db.join_website_on_search_result(hits)
         except InvalidQueryException as e:
             flash("<strong>Invalid query:</strong> " + str(e), "warning")
             return redirect("/search")
@@ -299,6 +299,6 @@ if __name__ == '__main__':
     if config.USE_SSL:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
-        app.run("0.0.0.0", port=12345, ssl_context=context)
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
     else:
-        app.run("0.0.0.0", port=12345)
+        app.run("0.0.0.0", port=12345, threaded=True)

database.py

@@ -206,12 +206,15 @@ class Database:
             result[db_website[0]] = db_website[1]
         return result

-    def join_search_result(self, page: dict) -> dict:
+    def join_website_on_search_result(self, page: dict) -> dict:
         websites = self.get_all_websites()

         for hit in page["hits"]["hits"]:
-            hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            if hit["_source"]["website_id"] in websites:
+                hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            else:
+                hit["_source"]["website_url"] = "NONE"

         return page
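For context, the method walks an Elasticsearch response dict and annotates each hit with the URL from the id-to-URL map that get_all_websites() returns. A minimal sketch (illustrative values, not part of this commit) of the new fallback for hits whose website row no longer exists:

# Illustration only: a stripped-down search response and website map.
websites = {1: "http://example.com/"}  # id -> url, as get_all_websites() returns
page = {"hits": {"hits": [
    {"_source": {"website_id": 1, "name": "a", "path": "files"}},
    {"_source": {"website_id": 2, "name": "b", "path": "files"}},  # orphaned id
]}}

for hit in page["hits"]["hits"]:
    src = hit["_source"]
    # Same guard as join_website_on_search_result, written with dict.get():
    # the old join_search_result raised KeyError on deleted websites.
    src["website_url"] = websites.get(src["website_id"], "NONE")

# first hit -> "http://example.com/", second -> "NONE"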

od-database.ini (new file, 8 lines)

@@ -0,0 +1,8 @@
[uwsgi]
socket = 127.0.0.1:3031
chdir = /home/simon/Dropbox/data/CS/python/od-database/
wsgi-file = uwsgi.py
processes = 4
threads = 4
stats = 127.0.0.1:9191
callable = app
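The stats line exposes uWSGI's JSON stats server on 127.0.0.1:9191. A minimal sketch of polling it (assumes a server started from this config is running; uwsgitop offers the same data interactively):

# Poll the uWSGI stats server declared above (stats = 127.0.0.1:9191).
# uWSGI writes one JSON document per connection, then closes the socket.
import json
import socket

with socket.create_connection(("127.0.0.1", 9191)) as s:
    data = b""
    while True:
        chunk = s.recv(4096)
        if not chunk:
            break
        data += chunk

stats = json.loads(data)
print(len(stats["workers"]), "workers")  # should match "processes = 4"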

search/search.py

@@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine):

         # Mappings
         self.es.indices.put_mapping(body={"properties": {
-            "path": {"analyzer": "my_nGram", "type": "text"},
-            "name": {"analyzer": "my_nGram", "type": "text"},
+            "path": {"analyzer": "standard", "type": "text"},
+            "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
             "website_id": {"type": "integer"},
@@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine):
         if not in_str:
             return

-        import_every = 1000
+        import_every = 5000

         docs = []
@@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine):
                     "must": {
                         "multi_match": {
                             "query": query,
-                            "fields": ["name", "path"],
-                            "operator": "and"
+                            "fields": ["name^5", "name.nGram^2", "path"],
+                            "operator": "or"
                         }
                     },
                     "filter": filters

startWSGI.sh (new file, 2 lines)

@@ -0,0 +1,2 @@
#!/usr/bin/env bash
uwsgi od-database.ini

stress_test.py (new file, 56 lines)

@@ -0,0 +1,56 @@
import os
import json
from search.search import ElasticSearchEngine
from concurrent.futures import ThreadPoolExecutor
import requests
import random


def dump_local_filesystem(root_dir: str):
    # Walk root_dir and write one JSON document per file, in the same
    # format the crawlers produce, so the result can be bulk-indexed.
    docs = []

    for root, dirs, files in os.walk(root_dir):
        for filename in files:
            full_path = os.path.join(root, filename)
            stats = os.stat(full_path)

            doc = dict()
            doc["name"] = filename
            doc["path"] = root
            doc["mtime"] = int(stats.st_mtime * 1000)  # mapping expects epoch_millis
            doc["size"] = stats.st_size

            docs.append(doc)

    with open("local_filesystem.json", "w") as f:
        f.writelines(json.dumps(doc) + "\n" for doc in docs)


def index_file_list(path: str, website_id):
    # Feed a dumped file list straight into the search engine.
    es = ElasticSearchEngine("od-database")
    with open(path, "r") as f:
        es.import_json(f.read(), website_id)


def search(term=""):
    # Run one search against the local instance and log the term.
    requests.get("http://localhost/?sort_order=score&per_page=100&q=" + term, verify=False)
    print(term)


def random_searches(count=10000000, max_workers=1000):
    # Hammer the search endpoint with random dictionary words.
    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
        .text.splitlines()

    pool = ThreadPoolExecutor(max_workers=max_workers)
    pool.map(search, random.choices(terms, k=count))


# dump_local_filesystem("/mnt/")
# index_file_list("local_filesystem.json", 10)
# random_searches(100000)
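The module keeps its entry points commented out at the bottom; a hypothetical driver appended to the file, with small, laptop-friendly numbers, might look like:

# Hypothetical bounded smoke test instead of the ten-million-query default.
if __name__ == "__main__":
    random_searches(count=1000, max_workers=50)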

task.py (21 changes)

@@ -28,7 +28,7 @@ class CrawlServer:
         except ConnectionError:
             return False

-    def get_completed_tasks(self) -> list:
+    def fetch_completed_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/completed")
@@ -36,9 +36,10 @@ class CrawlServer:
                 TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"])
                 for r in json.loads(r.text)]
         except ConnectionError:
+            print("Crawl server cannot be reached " + self.url)
             return []

-    def get_queued_tasks(self) -> list:
+    def fetch_queued_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/")
@@ -49,7 +50,7 @@ class CrawlServer:
         except ConnectionError:
             return []

-    def get_current_tasks(self):
+    def fetch_current_tasks(self):

         try:
             r = requests.get(self.url + "/task/current")
@@ -58,14 +58,13 @@ class CrawlServer:
                 for t in json.loads(r.text)
             ]
         except ConnectionError:
             print("Server cannot be reached " + self.url)
             return []

-    def get_file_list(self, website_id) -> str:
-
+    def fetch_website_files(self, website_id) -> str:
         try:
             r = requests.get(self.url + "/file_list/" + str(website_id) + "/")
-            return r.text
+            return r.text if r.status_code == 200 else ""
         except ConnectionError:
             return ""

@@ -73,6 +73,7 @@ class CrawlServer:
 class TaskDispatcher:

     def __init__(self):
+        # TODO: remove reddit
         reddit = praw.Reddit('opendirectories-bot',
                              user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
         self.reddit_bot = RedditBot("crawled.txt", reddit)
@@ -91,9 +92,9 @@ class TaskDispatcher:

     def check_completed_tasks(self):
         for server in self.crawl_servers:
-            for task in server.get_completed_tasks():
+            for task in server.fetch_completed_tasks():
                 print("Completed task")
-                file_list = server.get_file_list(task.website_id)
+                file_list = server.fetch_website_files(task.website_id)
                 self.search.import_json(file_list, task.website_id)

     def dispatch_task(self, task: Task):
@@ -108,7 +109,7 @@ class TaskDispatcher:
         queued_tasks = []

         for server in self.crawl_servers:
-            queued_tasks.extend(server.get_queued_tasks())
+            queued_tasks.extend(server.fetch_queued_tasks())

         return queued_tasks
@@ -117,7 +118,7 @@ class TaskDispatcher:
         current_tasks = []

         for server in self.crawl_servers:
-            current_tasks.extend(server.get_current_tasks())
+            current_tasks.extend(server.fetch_current_tasks())

         return current_tasks

templates/home.html

@@ -26,7 +26,7 @@
     <form action="/search">
         <div class="form-group">
-            <input class="form-control" name="q" id="q" placeholder="Full-text Query">
+            <input class="form-control" name="q" id="q" placeholder="Query">
         </div>

         <input class="btn btn-primary btn-shadow" type="submit" value="Search">
@@ -34,20 +34,5 @@
         </div>
     </div>
-    <div class="card">
-        <div class="card-header">Full-text Query Syntax</div>
-        <div class="card-body">
-            <p>The following query types are allowed (More information
-                <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">here</a>):</p>
-            <p>Exact term: <code> "foo"</code></p>
-            <p>Term with prefix: <code> "foo*"</code></p>
-            <p>File names: <code> "name:foo"</code></p>
-            <p>Paths: <code> "path:foo"</code></p>
-            <p>Starts with term: <code> "^foo"</code></p>
-            <p>NEAR group: <code> "NEAR(foo bar, 3)"</code></p>
-        </div>
-    </div>
 </div>
 {% endblock body %}

templates/search.html

@@ -58,8 +58,8 @@
     <tr>
         <td>
             {# File name & link #}
-            <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
-                {{ hl_name |safe }}{{ src["ext"] }}
+            <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
+                {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}
             </a>
             {# File type badge #}
             {% set mime = get_mime(src["path"]) %}
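This is the file-extension bugfix named in the commit title: file names are stored with the extension split off, and the old template concatenated them back without the separating dot (so readme + md rendered as readmemd, and links pointed at the wrong URL). The same guard in plain Python:

# Rebuild a file name from the split fields, as the template now does.
def full_name(name: str, ext: str) -> str:
    # Insert the dot only when an extension exists; "README" stays "README".
    return name + ("." + ext if ext != "" else "")

assert full_name("archive", "tar") == "archive.tar"
assert full_name("README", "") == "README"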

uwsgi.py (new file, 11 lines)

@@ -0,0 +1,11 @@
from app import app
import config
import ssl

if __name__ == '__main__':
    if config.USE_SSL:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
    else:
        app.run("0.0.0.0", port=12345, threaded=True)