Added website URL & date in search results & fixed threading problem

This commit is contained in:
Simon 2018-06-12 17:48:15 -04:00
parent 0127b3a51d
commit 4b60ac62fc
6 changed files with 87 additions and 60 deletions

10
app.py
View File

@@ -26,7 +26,7 @@ searchEngine = ElasticSearchEngine("od-database")
@app.template_filter("datetime_format") @app.template_filter("datetime_format")
def datetime_format(value, format='%Y-%m-%d %H:%M UTC'): def datetime_format(value, format='%Y-%m-%d'):
return time.strftime(format, time.gmtime(value)) return time.strftime(format, time.gmtime(value))
@@ -107,18 +107,16 @@ def search():
per_page = int(per_page) if per_page.isdigit() else "50" per_page = int(per_page) if per_page.isdigit() else "50"
per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
if q: if len(q) >= 3:
try: try:
# hits = sea.search(q, per_page, page, sort_order) hits = searchEngine.search(q, page, per_page, sort_order)
hits = searchEngine.search(q, page, per_page) hits = db.join_search_result(hits)
except InvalidQueryException as e: except InvalidQueryException as e:
flash("<strong>Invalid query:</strong> " + str(e), "warning") flash("<strong>Invalid query:</strong> " + str(e), "warning")
return redirect("/search") return redirect("/search")
else: else:
hits = None hits = None
print(hits)
return render_template("search.html", return render_template("search.html",
results=hits, q=q, p=page, sort_order=sort_order, results=hits, q=q, p=page, sort_order=sort_order,
per_page=per_page, results_set=config.RESULTS_PER_PAGE) per_page=per_page, results_set=config.RESULTS_PER_PAGE)

View File

@@ -1,5 +1,5 @@
from crawl_server.database import TaskManagerDatabase, Task, TaskResult from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from multiprocessing import Pool from concurrent.futures import ProcessPoolExecutor
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime from datetime import datetime
from crawl_server.crawler import RemoteDirectoryCrawler from crawl_server.crawler import RemoteDirectoryCrawler
@@ -10,7 +10,7 @@ class TaskManager:
def __init__(self, db_path, max_processes=8): def __init__(self, db_path, max_processes=8):
self.db_path = db_path self.db_path = db_path
self.db = TaskManagerDatabase(db_path) self.db = TaskManagerDatabase(db_path)
self.pool = Pool(processes=max_processes) self.pool = ProcessPoolExecutor(max_workers=max_processes)
self.current_tasks = [] self.current_tasks = []
@@ -39,12 +39,10 @@ class TaskManager:
print("pooled " + task.url) print("pooled " + task.url)
self.pool.apply_async( self.pool.submit(
TaskManager.run_task, TaskManager.run_task,
args=(task, self.db_path), task, self.db_path
callback=TaskManager.task_complete, ).add_done_callback(TaskManager.task_complete)
error_callback=TaskManager.task_error
)
@staticmethod @staticmethod
def run_task(task, db_path): def run_task(task, db_path):
@@ -63,19 +61,20 @@ class TaskManager:
result.end_time = datetime.utcnow() result.end_time = datetime.utcnow()
print("End task " + task.url) print("End task " + task.url)
return dict(result=result, db_path=db_path) return result, db_path
@staticmethod @staticmethod
def task_complete(kwargs): def task_complete(result):
result = kwargs["result"]
db_path = kwargs["db_path"] task_result, db_path = result.result()
print(result.status_code)
print(result.file_count) print(task_result.status_code)
print(result.start_time) print(task_result.file_count)
print(result.end_time) print(task_result.start_time)
print(task_result.end_time)
db = TaskManagerDatabase(db_path) db = TaskManagerDatabase(db_path)
db.log_result(result) db.log_result(task_result)
print("Logged result to DB") print("Logged result to DB")
@staticmethod @staticmethod

View File

@@ -191,6 +191,31 @@ class Database:
cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, )) cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, ))
conn.commit() conn.commit()
def _get_websites(self) -> dict:
# todo: mem cache that
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("SELECT id, url FROM Website")
result = {}
for db_website in cursor.fetchall():
result[db_website[0]] = db_website[1]
return result
def join_search_result(self, page: dict) -> dict:
websites = self._get_websites()
for hit in page["hits"]["hits"]:
hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
return page

View File

@@ -1,4 +1,6 @@
import elasticsearch import elasticsearch
import os
import json
from elasticsearch.exceptions import TransportError from elasticsearch.exceptions import TransportError
@@ -14,10 +16,7 @@ class SearchEngine:
def import_json(self, in_str: str, website_id: int): def import_json(self, in_str: str, website_id: int):
raise NotImplementedError raise NotImplementedError
def search(self, query) -> {}: def search(self, query, page, per_page, sort_order) -> {}:
raise NotImplementedError
def scroll(self, scroll_id) -> {}:
raise NotImplementedError raise NotImplementedError
def reset(self): def reset(self):
@@ -29,6 +28,15 @@ class SearchEngine:
class ElasticSearchEngine(SearchEngine): class ElasticSearchEngine(SearchEngine):
SORT_ORDERS = {
"score": ["_score"],
"size_asc": [{"size": {"order": "asc"}}],
"size_dsc": [{"size": {"order": "desc"}}],
"date_asc": [{"mtime": {"order": "asc"}}],
"date_desc": [{"mtime": {"order": "desc"}}],
"none": []
}
def __init__(self, index_name): def __init__(self, index_name):
super().__init__() super().__init__()
self.index_name = index_name self.index_name = index_name
@@ -68,7 +76,8 @@ class ElasticSearchEngine(SearchEngine):
"name": {"analyzer": "my_nGram", "type": "text"}, "name": {"analyzer": "my_nGram", "type": "text"},
"mtime": {"type": "date", "format": "epoch_millis"}, "mtime": {"type": "date", "format": "epoch_millis"},
"size": {"type": "long"}, "size": {"type": "long"},
"website_id": {"type": "integer"} "website_id": {"type": "integer"},
"ext": {"type": "keyword"}
}}, doc_type="file", index=self.index_name) }}, doc_type="file", index=self.index_name)
self.es.indices.open(index=self.index_name) self.es.indices.open(index=self.index_name)
@@ -85,16 +94,21 @@ class ElasticSearchEngine(SearchEngine):
docs = [] docs = []
for line in in_str.splitlines(): for line in in_str.splitlines():
docs.append(line) doc = json.loads(line)
name, ext = os.path.splitext(doc["name"])
doc["ext"] = ext if ext else ""
doc["name"] = name
doc["website_id"] = website_id
docs.append(doc)
if len(docs) >= import_every: if len(docs) >= import_every:
self._index(docs, website_id) self._index(docs)
docs.clear() docs.clear()
self._index(docs, website_id) self._index(docs)
def _index(self, docs, website_id): def _index(self, docs):
print("Indexing " + str(len(docs)) + " docs") print("Indexing " + str(len(docs)) + " docs")
bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id) bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file") result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")
if result["errors"]: if result["errors"]:
@@ -102,17 +116,15 @@ class ElasticSearchEngine(SearchEngine):
raise IndexingError raise IndexingError
@staticmethod @staticmethod
def create_bulk_index_string(docs: list, website_id: int): def create_bulk_index_string(docs: list):
action_string = '{"index":{}}\n' action_string = '{"index":{}}\n'
website_id_string = ',"website_id":' + str(website_id) + '}\n' # Add website_id param to each doc return "\n".join("".join([action_string, json.dumps(doc)]) for doc in docs)
return "\n".join("".join([action_string, doc[:-1], website_id_string]) for doc in docs) def search(self, query, page, per_page, sort_order) -> {}:
def search(self, query, page, per_page) -> {}:
filters = [] filters = []
sort_by = ["_score"] sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])
page = self.es.search(body={ page = self.es.search(body={
"query": { "query": {

View File

@@ -11,7 +11,7 @@
padding-bottom: 0.3rem; padding-bottom: 0.3rem;
} }
.table td { .table td {
padding: .40rem .75rem; padding: 2px 0;
} }
.bg-application { .bg-application {
@@ -75,4 +75,4 @@
.hl { .hl {
background: #fff217; background: #fff217;
} }

View File

@@ -18,17 +18,12 @@
<div class="form-group col-md-2"> <div class="form-group col-md-2">
<select class="form-control" name="sort_order" title="Sort order"> <select class="form-control" name="sort_order" title="Sort order">
<option disabled>Select sort order</option> <option disabled>Select sort order</option>
<option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance </option>
</option> <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size ascending </option>
<option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size descending</option>
ascending <option value="date_asc" {{ "selected" if sort_order == "date_asc" else "" }}>Date ascending</option>
</option> <option value="date_dsc" {{ "selected" if sort_order == "date_dsc" else "" }}>Date descending</option>
<option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order (faster)</option>
descending
</option>
<option value="none" {{ "selected" if sort_order == "none" else "" }}>No order
(faster)
</option>
</select> </select>
</div> </div>
<div class="form-group col-md-2"> <div class="form-group col-md-2">
@@ -47,7 +42,7 @@
</div> </div>
</div> </div>
{% if results["hits"]["total"] > 0 %} {% if results and results["hits"]["total"] > 0 %}
<div class="card"> <div class="card">
<div class="card-body"> <div class="card-body">
@@ -60,13 +55,11 @@
{% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %} {% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %}
{% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %} {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
{# TODO: website url + path #}
{% set path = src["path"] %}
<tr> <tr>
<td> <td>
{# File name & link #} {# File name & link #}
<a href="{{ path + "/" + src["name"] }}" title="{{ src["name"] }}"> <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
{{ hl_name |safe }} {{ hl_name |safe }}{{ src["ext"] }}
</a> </a>
{# File type badge #} {# File type badge #}
{% set mime = get_mime(src["path"]) %} {% set mime = get_mime(src["path"]) %}
@@ -78,13 +71,13 @@
{# File path #} {# File path #}
<div class="text-muted" title="{{ path }}" style="font-size: 10px;"> <div class="text-muted" title="{{ path }}" style="font-size: 10px;">
<a style="color: #6c757d" title="See files from this website" <a style="color: #6c757d" title="See files from this website"
{# todo: website url #} href="/website/{{ src["website_id"] }}">{{ src["website_url"] }}</a>{{ hl_path|safe }}
href="/website/{{ src["website_id"] }}">{{ hl_path | safe }}</a>{{ truncate_path(src["path"], 60) }}
</div> </div>
</td> </td>
{# File size #} {# File size & date #}
<td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px"> <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }} <div>{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}</div>
<code>{{ src["mtime"] | datetime_format }}</code>
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}