Mirror of https://github.com/simon987/od-database.git (synced 2025-04-20 10:56:47 +00:00)

commit 9bde8cb629 (parent e91572a06f)

    uWSGI config and bugfix with file extensions
app.py (3 additions, 3 deletions)

@@ -113,7 +113,7 @@ def search():
     if len(q) >= 3:
         try:
             hits = searchEngine.search(q, page, per_page, sort_order)
-            hits = db.join_search_result(hits)
+            hits = db.join_website_on_search_result(hits)
         except InvalidQueryException as e:
             flash("<strong>Invalid query:</strong> " + str(e), "warning")
             return redirect("/search")

@@ -299,6 +299,6 @@ if __name__ == '__main__':
     if config.USE_SSL:
         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
         context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
-        app.run("0.0.0.0", port=12345, ssl_context=context)
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
     else:
-        app.run("0.0.0.0", port=12345)
+        app.run("0.0.0.0", port=12345, threaded=True)
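Both app.run calls now pass threaded=True, so Werkzeug's development server handles each request in its own thread instead of serializing them. A minimal sketch of the equivalent explicit Werkzeug call (not part of the commit; the uWSGI setup below replaces this server in production anyway):

    # sketch: the same threaded development server invoked through Werkzeug directly
    from werkzeug.serving import run_simple
    from app import app

    run_simple("0.0.0.0", 12345, app, threaded=True)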
database.py (5 additions, 2 deletions)

@@ -206,12 +206,15 @@ class Database:
             result[db_website[0]] = db_website[1]
         return result

-    def join_search_result(self, page: dict) -> dict:
+    def join_website_on_search_result(self, page: dict) -> dict:

         websites = self.get_all_websites()

         for hit in page["hits"]["hits"]:
-            hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            if hit["_source"]["website_id"] in websites:
+                hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
+            else:
+                hit["_source"]["website_url"] = "NONE"

         return page
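join_website_on_search_result (renamed from join_search_result) now tolerates hits whose website_id has no matching website row, instead of raising KeyError. The same guard can be written with dict.get; a sketch, not the committed code:

    for hit in page["hits"]["hits"]:
        # .get() substitutes the fallback instead of raising KeyError on unknown ids
        hit["_source"]["website_url"] = websites.get(hit["_source"]["website_id"], "NONE")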
od-database.ini (new file, 8 lines)

@@ -0,0 +1,8 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = /home/simon/Dropbox/data/CS/python/od-database/
+wsgi-file = uwsgi.py
+processes = 4
+threads = 4
+stats = 127.0.0.1:9191
+callable=app
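The stats option exposes a JSON snapshot of worker state on 127.0.0.1:9191, the same socket uwsgitop reads. A hedged sketch of polling it by hand, assuming uWSGI's documented push-on-connect stats format:

    import json
    import socket

    # uWSGI's stats server writes one JSON document to each connecting client
    with socket.create_connection(("127.0.0.1", 9191)) as s:
        data = b""
        while chunk := s.recv(4096):
            data += chunk

    stats = json.loads(data)
    for worker in stats["workers"]:
        print(worker["id"], worker["status"], worker["requests"])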
search/search.py (5 additions, 5 deletions)

@@ -74,8 +74,8 @@ class ElasticSearchEngine(SearchEngine):

         # Mappings
         self.es.indices.put_mapping(body={"properties": {
-            "path": {"analyzer": "my_nGram", "type": "text"},
-            "name": {"analyzer": "my_nGram", "type": "text"},
+            "path": {"analyzer": "standard", "type": "text"},
+            "name": {"analyzer": "standard", "type": "text", "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
             "mtime": {"type": "date", "format": "epoch_millis"},
             "size": {"type": "long"},
             "website_id": {"type": "integer"},
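With the multi-field mapping, name is indexed twice: word-by-word under the standard analyzer and as character n-grams under name.nGram, so a query can hit either representation. One way to compare what the two analyzers emit, assuming my_nGram is declared in the index settings (a sketch):

    from elasticsearch import Elasticsearch

    es = Elasticsearch()

    # show the tokens each analyzer produces for the same file name
    for analyzer in ("standard", "my_nGram"):
        res = es.indices.analyze(index="od-database", body={"analyzer": analyzer, "text": "ubuntu-18.04.iso"})
        print(analyzer, [t["token"] for t in res["tokens"]])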
@@ -95,7 +95,7 @@ class ElasticSearchEngine(SearchEngine):
     if not in_str:
         return

-    import_every = 1000
+    import_every = 5000

     docs = []
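import_every is the flush threshold for bulk indexing: documents are buffered and sent one batch per request, so 5000 trades a little memory for fewer round-trips. The buffering pattern it implies looks roughly like this (a sketch using elasticsearch.helpers.bulk, not the committed implementation):

    import json
    from elasticsearch import helpers

    def bulk_import(es, index, in_str, website_id, import_every=5000):
        docs = []
        for line in in_str.splitlines():
            doc = json.loads(line)
            doc["website_id"] = website_id
            docs.append({"_index": index, "_source": doc})
            if len(docs) >= import_every:
                helpers.bulk(es, docs)  # flush a full batch
                docs.clear()
        if docs:
            helpers.bulk(es, docs)  # flush the remainder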
@@ -138,8 +138,8 @@ class ElasticSearchEngine(SearchEngine):
                 "must": {
                     "multi_match": {
                         "query": query,
-                        "fields": ["name", "path"],
-                        "operator": "and"
+                        "fields": ["name^5", "name.nGram^2", "path"],
+                        "operator": "or"
                     }
                 },
                 "filter": filters
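name^5 and name.nGram^2 are per-field boosts: a whole-word match on the file name weighs five times a path match, with n-gram (substring) matches in between, and switching the operator to or returns documents matching any query term rather than all of them. Issued standalone, the query body looks like this (sketch with a made-up search term, reusing the es client from above):

    body = {
        "query": {
            "multi_match": {
                "query": "ubuntu iso",
                "fields": ["name^5", "name.nGram^2", "path"],  # boosted fields
                "operator": "or"  # any term may match
            }
        }
    }
    hits = es.search(index="od-database", body=body)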
startWSGI.sh (new file, 2 lines)

@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+uwsgi od-database.ini
stress_test.py (new file, 56 lines)

@@ -0,0 +1,56 @@
+import os
+import json
+import sys
+from search.search import ElasticSearchEngine
+from concurrent.futures import ThreadPoolExecutor
+import requests
+import random
+
+
+def dump_local_filesystem(root_dir: str):
+
+    docs = []
+
+    for root, dirs, files in os.walk(root_dir):
+
+        for filename in files:
+            full_path = os.path.join(root, filename)
+            stats = os.stat(full_path)
+
+            doc = dict()
+            doc["name"] = filename
+            doc["path"] = root
+            doc["mtime"] = stats.st_mtime
+            doc["size"] = stats.st_size
+
+            docs.append(doc)
+
+    with open("local_filesystem.json", "w") as f:
+        f.writelines(json.dumps(doc) + "\n" for doc in docs)
+
+
+def index_file_list(path: str, website_id):
+
+    es = ElasticSearchEngine("od-database")
+    with open(path, "r") as f:
+        es.import_json(f.read(), website_id)
+
+
+def search(term=""):
+    requests.get("http://localhost/?sort_order=score&per_page=100&q=" + term, verify=False)
+    print(term)
+
+
+def random_searches(count=10000000, max_workers=1000):
+
+    terms = requests.get("https://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain")\
+        .text.splitlines()
+
+    pool = ThreadPoolExecutor(max_workers=max_workers)
+    pool.map(search, random.choices(terms, k=count))
+
+
+# dump_local_filesystem("/mnt/")
+# index_file_list("local_filesystem.json", 10)
+# random_searches(100000)
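random_searches pushes count requests through a pool of up to max_workers threads. Running the executor as a context manager makes the shutdown explicit, blocking until every queued search has completed (a sketch, not the committed code):

    # equivalent body for random_searches with explicit pool shutdown
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(search, random.choices(terms, k=count))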
task.py (11 additions, 10 deletions)

@@ -28,7 +28,7 @@ class CrawlServer:
         except ConnectionError:
             return False

-    def get_completed_tasks(self) -> list:
+    def fetch_completed_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/completed")

@@ -36,9 +36,10 @@ class CrawlServer:
                 TaskResult(r["status_code"], r["file_count"], r["start_time"], r["end_time"], r["website_id"])
                 for r in json.loads(r.text)]
         except ConnectionError:
+            print("Crawl server cannot be reached " + self.url)
             return []

-    def get_queued_tasks(self) -> list:
+    def fetch_queued_tasks(self) -> list:

         try:
             r = requests.get(self.url + "/task/")

@@ -49,7 +50,7 @@ class CrawlServer:
         except ConnectionError:
             return []

-    def get_current_tasks(self):
+    def fetch_current_tasks(self):

         try:
             r = requests.get(self.url + "/task/current")

@@ -58,14 +59,13 @@ class CrawlServer:
                 for t in json.loads(r.text)
             ]
         except ConnectionError:
-            print("Server cannot be reached " + self.url)
             return []

-    def get_file_list(self, website_id) -> str:
+    def fetch_website_files(self, website_id) -> str:

         try:
             r = requests.get(self.url + "/file_list/" + str(website_id) + "/")
-            return r.text
+            return r.text if r.status_code == 200 else ""
         except ConnectionError:
             return ""

@@ -73,6 +73,7 @@ class CrawlServer:
 class TaskDispatcher:

     def __init__(self):
+        # TODO: remove reddit
         reddit = praw.Reddit('opendirectories-bot',
                              user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
         self.reddit_bot = RedditBot("crawled.txt", reddit)

@@ -91,9 +92,9 @@ class TaskDispatcher:
     def check_completed_tasks(self):

         for server in self.crawl_servers:
-            for task in server.get_completed_tasks():
+            for task in server.fetch_completed_tasks():
                 print("Completed task")
-                file_list = server.get_file_list(task.website_id)
+                file_list = server.fetch_website_files(task.website_id)
                 self.search.import_json(file_list, task.website_id)

     def dispatch_task(self, task: Task):

@@ -108,7 +109,7 @@ class TaskDispatcher:
         queued_tasks = []

         for server in self.crawl_servers:
-            queued_tasks.extend(server.get_queued_tasks())
+            queued_tasks.extend(server.fetch_queued_tasks())

         return queued_tasks

@@ -117,7 +118,7 @@ class TaskDispatcher:

         current_tasks = []
         for server in self.crawl_servers:
-            current_tasks.extend(server.get_current_tasks())
+            current_tasks.extend(server.fetch_current_tasks())

         return current_tasks
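The get_* accessors become fetch_* to make the remote round-trip explicit, fetch_completed_tasks now logs which crawl server was unreachable, and fetch_website_files only trusts a 200 response. A hedged usage sketch, assuming CrawlServer is constructed from the server's base URL:

    server = CrawlServer("http://localhost:5001")  # hypothetical crawl server address

    for task in server.fetch_completed_tasks():
        file_list = server.fetch_website_files(task.website_id)
        if file_list:  # empty string signals a connection error or non-200 response
            print("importing file list for website", task.website_id)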
@@ -26,7 +26,7 @@
         <form action="/search">

             <div class="form-group">
-                <input class="form-control" name="q" id="q" placeholder="Full-text Query">
+                <input class="form-control" name="q" id="q" placeholder="Query">
             </div>

             <input class="btn btn-primary btn-shadow" type="submit" value="Search">

@@ -34,20 +34,5 @@
             </div>
         </div>
-
-        <div class="card">
-            <div class="card-header">Full-text Query Syntax</div>
-            <div class="card-body">
-
-                <p>The following query types are allowed (More information
-                    <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">here</a>):</p>
-                <p>Exact term: <code> "foo"</code></p>
-                <p>Term with prefix: <code> "foo*"</code></p>
-                <p>File names: <code> "name:foo"</code></p>
-                <p>Paths: <code> "path:foo"</code></p>
-                <p>Starts with term: <code> "^foo"</code></p>
-                <p>NEAR group: <code> "NEAR(foo bar, 3)"</code></p>
-            </div>
-        </div>

     </div>
 {% endblock body %}
@@ -58,8 +58,8 @@
             <tr>
                 <td>
                     {# File name & link #}
-                    <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
-                        {{ hl_name |safe }}{{ src["ext"] }}
+                    <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
+                        {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}
                     </a>
                     {# File type badge #}
                     {% set mime = get_mime(src["path"]) %}
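This is the file-extension bugfix from the commit title: links and labels previously concatenated name and ext with no separator, producing readmemd instead of readme.md, and the dot is now inserted only when an extension exists, so extensionless files stay intact. The logic, restated as a plain Python sketch:

    def display_name(name: str, ext: str) -> str:
        # insert the dot only when there is an extension to append
        return name + ("." if ext != "" else "") + ext

    assert display_name("readme", "md") == "readme.md"
    assert display_name("Makefile", "") == "Makefile"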
uwsgi.py (new file, 11 lines)

@@ -0,0 +1,11 @@
+from app import app
+import config
+import ssl
+
+if __name__ == '__main__':
+    if config.USE_SSL:
+        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        context.load_cert_chain('certificates/cert.pem', 'certificates/privkey.pem')
+        app.run("0.0.0.0", port=12345, ssl_context=context, threaded=True)
+    else:
+        app.run("0.0.0.0", port=12345, threaded=True)