Mirror of https://github.com/simon987/od-database.git (synced 2025-04-17 17:36:48 +00:00)
uWSGI config and bugfix with file extensions
This commit is contained in:
parent: e91572a06f
commit: 9bde8cb629

app.py (6 changed lines)
@@ -113,7 +113,7 @@ def search():
     if len(q) >= 3:
         try:
             hits = searchEngine.search(q, page, per_page, sort_order)
-            hits = db.join_search_result(hits)
+            hits = db.join_website_on_search_result(hits)
         except InvalidQueryException as e:
             flash("<strong>Invalid query:</strong> " + str(e), "warning")
             return redirect("/search")

@@ -299,6 +299,6 @@ if __name__ == '__main__':
     if config.USE_SSL:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
-        app.run("0.0.0.0", port=12345, ssl_context=context)
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
     else:
-        app.run("0.0.0.0", port=12345)
+        app.run("0.0.0.0", port=12345, threaded=True)
@@ -206,12 +206,15 @@ class Database:
             result[db_website[0]] = db_website[1]
         return result

-    def join_search_result(self, page: dict) -> dict:
+    def join_website_on_search_result(self, page: dict) -> dict:

         websites = self.get_all_websites()

         for hit in page["hits"]["hits"]:
-            hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            if hit["_source"]["website_id"] in websites:
+                hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            else:
+                hit["_source"]["website_url"] = "NONE"

         return page
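The renamed join_website_on_search_result() now tolerates hits whose website_id has no matching website row. A minimal standalone sketch of the same lookup (names taken from the diff; dict.get is an equivalent formulation, not the committed code):

    def join_website_on_search_result(page: dict, websites: dict) -> dict:
        # Attach website_url to every hit, falling back to the "NONE"
        # placeholder when the website_id is unknown.
        for hit in page["hits"]["hits"]:
            src = hit["_source"]
            src["website_url"] = websites.get(src["website_id"], "NONE")
        return page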
od-database.ini (new file, 8 lines)
@@ -0,0 +1,8 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = /home/simon/Dropbox/data/CS/python/od-database/
+wsgi-file = uwsgi.py
+processes = 4
+threads = 4
+stats = 127.0.0.1:9191
+callable=app
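The stats entry enables uWSGI's stats server on 127.0.0.1:9191, which normally writes a JSON status document to each client that connects. A small polling sketch under that assumption (not part of this commit):

    import json
    import socket

    def read_uwsgi_stats(host="127.0.0.1", port=9191) -> dict:
        # Assumption: the stats server sends one JSON blob, then closes the connection.
        with socket.create_connection((host, port)) as conn:
            payload = b""
            while True:
                chunk = conn.recv(4096)
                if not chunk:
                    break
                payload += chunk
        return json.loads(payload.decode())

    stats = read_uwsgi_stats()
    print(len(stats.get("workers", [])), "workers")  # "workers" key assumed from uWSGI's stats format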
search/search.py

@@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine):

         # Mappings
         self.es.indices.put_mapping(body={"properties": {
-            "path": {"analyzer": "my_nGram", "type": "text"},
-            "name": {"analyzer": "my_nGram", "type": "text"},
+            "path": {"analyzer": "standard", "type": "text"},
+            "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
             "website_id": {"type": "integer"},

@@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine):
         if not in_str:
             return

-        import_every = 1000
+        import_every = 5000

         docs = []

@@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine):
                 "must": {
                     "multi_match": {
                         "query": query,
-                        "fields": ["name", "path"],
-                        "operator": "and"
+                        "fields": ["name^5", "name.nGram^2", "path"],
+                        "operator": "or"
                    }
                },
                "filter": filters
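With the new mapping, name and path are indexed with the standard analyzer and the my_nGram analyzer only feeds the name.nGram sub-field, while the query now boosts whole-name matches over nGram and path matches and accepts partial term matches (operator "or"). The resulting request body looks roughly like this (a sketch; only the multi_match part is taken from the diff, the outer bool/query wrapper and the empty filters list are assumptions):

    query_body = {
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": "ubuntu iso",                        # user query
                        "fields": ["name^5", "name.nGram^2", "path"],  # exact name matches score highest
                        "operator": "or"                               # any term may match
                    }
                },
                "filter": []  # size/mtime/website filters would go here
            }
        }
    }
    # hits = es.search(index="od-database", body=query_body)  # hypothetical call, for illustration only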
startWSGI.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+uwsgi od-database.ini
stress_test.py (new file, 56 lines)
@@ -0,0 +1,56 @@
+import os
+import json
+import sys
+from search.search import ElasticSearchEngine
+from concurrent.futures import ThreadPoolExecutor
+import requests
+import random
+
+
+def dump_local_filesystem(root_dir: str):
+
+    docs = []
+
+    for root, dirs, files in os.walk(root_dir):
+
+        for filename in files:
+            full_path = os.path.join(root, filename)
+            stats = os.stat(full_path)
+
+            doc = dict()
+            doc["name"] = filename
+            doc["path"] = root
+            doc["mtime"] = stats.st_mtime
+            doc["size"] = stats.st_size
+
+            docs.append(doc)
+
+    with open("local_filesystem.json", "w") as f:
+        f.writelines(json.dumps(doc) + "\n" for doc in docs)
+
+
+def index_file_list(path: str, website_id):
+
+    es = ElasticSearchEngine("od-database")
+    with open(path, "r") as f:
+        es.import_json(f.read(), website_id)
+
+
+def search(term=""):
+    requests.get("http://localhost/?&sort_order=score&per_page=100q=" + term, verify=False)
+    print(term)
+
+
+def random_searches(count=10000000, max_workers=1000):
+
+    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
+        .text.splitlines()
+
+    pool = ThreadPoolExecutor(max_workers=max_workers)
+    pool.map(search, random.choices(terms, k=count))
+
+
+
+# dump_local_filesystem("/mnt/")
+# index_file_list("local_filesystem.json", 10)
+# random_searches(100000)
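The URL built in search() appears to be missing an "&" between per_page=100 and q=, so the search term ends up glued to the per_page value. Letting requests assemble the query string avoids that class of mistake (a sketch, not part of the commit):

    import requests

    def search(term=""):
        # requests adds the separators and URL-encodes the values itself
        requests.get("http://localhost/",
                     params={"sort_order": "score", "per_page": 100, "q": term},
                     verify=False)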
task.py (21 changed lines)
@@ -28,7 +28,7 @@ class CrawlServer:
        except ConnectionError:
            return False

-    def get_completed_tasks(self) -> list:
+    def fetch_completed_tasks(self) -> list:

        try:
            r = requests.get(self.url + "/task/completed")

@@ -36,9 +36,10 @@ class CrawlServer:
                TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"])
                for r in json.loads(r.text)]
        except ConnectionError:
+            print("Crawl server cannot be reached " + self.url)
            return []

-    def get_queued_tasks(self) -> list:
+    def fetch_queued_tasks(self) -> list:

        try:
            r = requests.get(self.url + "/task/")

@@ -49,7 +50,7 @@ class CrawlServer:
        except ConnectionError:
            return []

-    def get_current_tasks(self):
+    def fetch_current_tasks(self):

        try:
            r = requests.get(self.url + "/task/current")

@@ -58,14 +59,13 @@ class CrawlServer:
                for t in json.loads(r.text)
            ]
        except ConnectionError:
-            print("Server cannot be reached " + self.url)
            return []

-    def get_file_list(self, website_id) -> str:
+    def fetch_website_files(self, website_id) -> str:

        try:
            r = requests.get(self.url + "/file_list/" + str(website_id) + "/")
-            return r.text
+            return r.text if r.status_code == 200 else ""
        except ConnectionError:
            return ""


@@ -73,6 +73,7 @@ class CrawlServer:
class TaskDispatcher:

    def __init__(self):
+        # TODO: remove reddit
        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

@@ -91,9 +92,9 @@ class TaskDispatcher:
    def check_completed_tasks(self):

        for server in self.crawl_servers:
-            for task in server.get_completed_tasks():
+            for task in server.fetch_completed_tasks():
                print("Completed task")
-                file_list = server.get_file_list(task.website_id)
+                file_list = server.fetch_website_files(task.website_id)
                self.search.import_json(file_list, task.website_id)

    def dispatch_task(self, task: Task):

@@ -108,7 +109,7 @@ class TaskDispatcher:
        queued_tasks = []

        for server in self.crawl_servers:
-            queued_tasks.extend(server.get_queued_tasks())
+            queued_tasks.extend(server.fetch_queued_tasks())

        return queued_tasks

@@ -117,7 +118,7 @@ class TaskDispatcher:

        current_tasks = []
        for server in self.crawl_servers:
-            current_tasks.extend(server.get_current_tasks())
+            current_tasks.extend(server.fetch_current_tasks())

        return current_tasks
@@ -26,7 +26,7 @@
    <form action="/search">

        <div class="form-group">
-            <input class="form-control" name="q" id="q" placeholder="Full-text Query">
+            <input class="form-control" name="q" id="q" placeholder="Query">
        </div>

        <input class="btn btn-primary btn-shadow" type="submit" value="Search">

@@ -34,20 +34,5 @@
        </div>
    </div>

-    <div class="card">
-        <div class="card-header">Full-text Query Syntax</div>
-        <div class="card-body">
-
-            <p>The following query types are allowed (More information
-                <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">here</a>):</p>
-            <p>Exact term: <code> "foo"</code></p>
-            <p>Term with prefix: <code> "foo*"</code></p>
-            <p>File names: <code> "name:foo"</code></p>
-            <p>Paths: <code> "path:foo"</code></p>
-            <p>Starts with term: <code> "^foo"</code></p>
-            <p>NEAR group: <code> "NEAR(foo bar, 3)"</code></p>
-        </div>
-    </div>
-
</div>
{% endblock body %}
@@ -58,8 +58,8 @@
                <tr>
                    <td>
                        {# File name & link #}
-                        <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
-                            {{ hl_name |safe }}{{ src["ext"] }}
+                        <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
+                            {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}
                        </a>
                        {# File type badge #}
                        {% set mime = get_mime(src["path"]) %}
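This is the file-extension bugfix from the commit title: name and extension are stored separately, and the template now inserts the separating dot only when the document actually has an extension. In Python terms the new expression behaves like this (names from the template, shown only for illustration):

    def display_name(name: str, ext: str) -> str:
        # Insert the "." separator only when there is an extension to append.
        return name + ("." if ext != "" else "") + ext

    print(display_name("README", ""))     # -> README (no trailing dot)
    print(display_name("ubuntu", "iso"))  # -> ubuntu.iso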
uwsgi.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+from app import app
+import config
+import ssl
+
+if __name__ == '__main__':
+    if not config.USE_SSL:
+        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
+    else:
+        app.run("0.0.0.0", port=12345, threaded=True)
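Under uWSGI this module is only imported: od-database.ini names it via wsgi-file and picks up the Flask application through callable=app, so the __main__ block is just a fallback for running the file directly. For a quick local sanity check of the entry point, Flask's built-in test client can be used (a sketch, not part of the commit; the /search route and the q parameter are taken from app.py above):

    from app import app  # same import uwsgi.py performs

    with app.test_client() as client:
        resp = client.get("/search", query_string={"q": "test"})
        print(resp.status_code)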