mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 02:16:47 +00:00
Added website url & date in search results & fixed threading problem
This commit is contained in:
parent
0127b3a51d
commit
4b60ac62fc
10
app.py
10
app.py
@ -26,7 +26,7 @@ searchEngine = ElasticSearchEngine("od-database")
|
|||||||
|
|
||||||
|
|
||||||
@app.template_filter("datetime_format")
|
@app.template_filter("datetime_format")
|
||||||
def datetime_format(value, format='%Y-%m-%d %H:%M UTC'):
|
def datetime_format(value, format='%Y-%m-%d'):
|
||||||
return time.strftime(format, time.gmtime(value))
|
return time.strftime(format, time.gmtime(value))
|
||||||
|
|
||||||
|
|
||||||
@ -107,18 +107,16 @@ def search():
|
|||||||
per_page = int(per_page) if per_page.isdigit() else "50"
|
per_page = int(per_page) if per_page.isdigit() else "50"
|
||||||
per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
|
per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
|
||||||
|
|
||||||
if q:
|
if len(q) >= 3:
|
||||||
try:
|
try:
|
||||||
# hits = sea.search(q, per_page, page, sort_order)
|
hits = searchEngine.search(q, page, per_page, sort_order)
|
||||||
hits = searchEngine.search(q, page, per_page)
|
hits = db.join_search_result(hits)
|
||||||
except InvalidQueryException as e:
|
except InvalidQueryException as e:
|
||||||
flash("<strong>Invalid query:</strong> " + str(e), "warning")
|
flash("<strong>Invalid query:</strong> " + str(e), "warning")
|
||||||
return redirect("/search")
|
return redirect("/search")
|
||||||
else:
|
else:
|
||||||
hits = None
|
hits = None
|
||||||
|
|
||||||
print(hits)
|
|
||||||
|
|
||||||
return render_template("search.html",
|
return render_template("search.html",
|
||||||
results=hits, q=q, p=page, sort_order=sort_order,
|
results=hits, q=q, p=page, sort_order=sort_order,
|
||||||
per_page=per_page, results_set=config.RESULTS_PER_PAGE)
|
per_page=per_page, results_set=config.RESULTS_PER_PAGE)
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
||||||
from multiprocessing import Pool
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from crawl_server.crawler import RemoteDirectoryCrawler
|
from crawl_server.crawler import RemoteDirectoryCrawler
|
||||||
@ -10,7 +10,7 @@ class TaskManager:
|
|||||||
def __init__(self, db_path, max_processes=8):
|
def __init__(self, db_path, max_processes=8):
|
||||||
self.db_path = db_path
|
self.db_path = db_path
|
||||||
self.db = TaskManagerDatabase(db_path)
|
self.db = TaskManagerDatabase(db_path)
|
||||||
self.pool = Pool(processes=max_processes)
|
self.pool = ProcessPoolExecutor(max_workers=max_processes)
|
||||||
|
|
||||||
self.current_tasks = []
|
self.current_tasks = []
|
||||||
|
|
||||||
@ -39,12 +39,10 @@ class TaskManager:
|
|||||||
|
|
||||||
print("pooled " + task.url)
|
print("pooled " + task.url)
|
||||||
|
|
||||||
self.pool.apply_async(
|
self.pool.submit(
|
||||||
TaskManager.run_task,
|
TaskManager.run_task,
|
||||||
args=(task, self.db_path),
|
task, self.db_path
|
||||||
callback=TaskManager.task_complete,
|
).add_done_callback(TaskManager.task_complete)
|
||||||
error_callback=TaskManager.task_error
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def run_task(task, db_path):
|
def run_task(task, db_path):
|
||||||
@ -63,19 +61,20 @@ class TaskManager:
|
|||||||
result.end_time = datetime.utcnow()
|
result.end_time = datetime.utcnow()
|
||||||
print("End task " + task.url)
|
print("End task " + task.url)
|
||||||
|
|
||||||
return dict(result=result, db_path=db_path)
|
return result, db_path
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def task_complete(kwargs):
|
def task_complete(result):
|
||||||
result = kwargs["result"]
|
|
||||||
db_path = kwargs["db_path"]
|
task_result, db_path = result.result()
|
||||||
print(result.status_code)
|
|
||||||
print(result.file_count)
|
print(task_result.status_code)
|
||||||
print(result.start_time)
|
print(task_result.file_count)
|
||||||
print(result.end_time)
|
print(task_result.start_time)
|
||||||
|
print(task_result.end_time)
|
||||||
|
|
||||||
db = TaskManagerDatabase(db_path)
|
db = TaskManagerDatabase(db_path)
|
||||||
db.log_result(result)
|
db.log_result(task_result)
|
||||||
print("Logged result to DB")
|
print("Logged result to DB")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
25
database.py
25
database.py
@ -191,6 +191,31 @@ class Database:
|
|||||||
cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, ))
|
cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, ))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
def _get_websites(self) -> dict:
|
||||||
|
|
||||||
|
# todo: mem cache that
|
||||||
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
|
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute("SELECT id, url FROM Website")
|
||||||
|
|
||||||
|
result = {}
|
||||||
|
|
||||||
|
for db_website in cursor.fetchall():
|
||||||
|
result[db_website[0]] = db_website[1]
|
||||||
|
return result
|
||||||
|
|
||||||
|
def join_search_result(self, page: dict) -> dict:
|
||||||
|
|
||||||
|
websites = self._get_websites()
|
||||||
|
|
||||||
|
for hit in page["hits"]["hits"]:
|
||||||
|
hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
import elasticsearch
|
import elasticsearch
|
||||||
|
import os
|
||||||
|
import json
|
||||||
from elasticsearch.exceptions import TransportError
|
from elasticsearch.exceptions import TransportError
|
||||||
|
|
||||||
|
|
||||||
@ -14,10 +16,7 @@ class SearchEngine:
|
|||||||
def import_json(self, in_str: str, website_id: int):
|
def import_json(self, in_str: str, website_id: int):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def search(self, query) -> {}:
|
def search(self, query, page, per_page, sort_order) -> {}:
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def scroll(self, scroll_id) -> {}:
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@ -29,6 +28,15 @@ class SearchEngine:
|
|||||||
|
|
||||||
class ElasticSearchEngine(SearchEngine):
|
class ElasticSearchEngine(SearchEngine):
|
||||||
|
|
||||||
|
SORT_ORDERS = {
|
||||||
|
"score": ["_score"],
|
||||||
|
"size_asc": [{"size": {"order": "asc"}}],
|
||||||
|
"size_dsc": [{"size": {"order": "desc"}}],
|
||||||
|
"date_asc": [{"mtime": {"order": "asc"}}],
|
||||||
|
"date_desc": [{"mtime": {"order": "desc"}}],
|
||||||
|
"none": []
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, index_name):
|
def __init__(self, index_name):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.index_name = index_name
|
self.index_name = index_name
|
||||||
@ -68,7 +76,8 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
"name": {"analyzer": "my_nGram", "type": "text"},
|
"name": {"analyzer": "my_nGram", "type": "text"},
|
||||||
"mtime": {"type": "date", "format": "epoch_millis"},
|
"mtime": {"type": "date", "format": "epoch_millis"},
|
||||||
"size": {"type": "long"},
|
"size": {"type": "long"},
|
||||||
"website_id": {"type": "integer"}
|
"website_id": {"type": "integer"},
|
||||||
|
"ext": {"type": "keyword"}
|
||||||
}}, doc_type="file", index=self.index_name)
|
}}, doc_type="file", index=self.index_name)
|
||||||
|
|
||||||
self.es.indices.open(index=self.index_name)
|
self.es.indices.open(index=self.index_name)
|
||||||
@ -85,16 +94,21 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
docs = []
|
docs = []
|
||||||
|
|
||||||
for line in in_str.splitlines():
|
for line in in_str.splitlines():
|
||||||
docs.append(line)
|
doc = json.loads(line)
|
||||||
|
name, ext = os.path.splitext(doc["name"])
|
||||||
|
doc["ext"] = ext if ext else ""
|
||||||
|
doc["name"] = name
|
||||||
|
doc["website_id"] = website_id
|
||||||
|
docs.append(doc)
|
||||||
|
|
||||||
if len(docs) >= import_every:
|
if len(docs) >= import_every:
|
||||||
self._index(docs, website_id)
|
self._index(docs)
|
||||||
docs.clear()
|
docs.clear()
|
||||||
self._index(docs, website_id)
|
self._index(docs)
|
||||||
|
|
||||||
def _index(self, docs, website_id):
|
def _index(self, docs):
|
||||||
print("Indexing " + str(len(docs)) + " docs")
|
print("Indexing " + str(len(docs)) + " docs")
|
||||||
bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id)
|
bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
|
||||||
result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")
|
result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")
|
||||||
|
|
||||||
if result["errors"]:
|
if result["errors"]:
|
||||||
@ -102,17 +116,15 @@ class ElasticSearchEngine(SearchEngine):
|
|||||||
raise IndexingError
|
raise IndexingError
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_bulk_index_string(docs: list, website_id: int):
|
def create_bulk_index_string(docs: list):
|
||||||
|
|
||||||
action_string = '{"index":{}}\n'
|
action_string = '{"index":{}}\n'
|
||||||
website_id_string = ',"website_id":' + str(website_id) + '}\n' # Add website_id param to each doc
|
return "\n".join("".join([action_string, json.dumps(doc)]) for doc in docs)
|
||||||
|
|
||||||
return "\n".join("".join([action_string, doc[:-1], website_id_string]) for doc in docs)
|
def search(self, query, page, per_page, sort_order) -> {}:
|
||||||
|
|
||||||
def search(self, query, page, per_page) -> {}:
|
|
||||||
|
|
||||||
filters = []
|
filters = []
|
||||||
sort_by = ["_score"]
|
sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])
|
||||||
|
|
||||||
page = self.es.search(body={
|
page = self.es.search(body={
|
||||||
"query": {
|
"query": {
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
padding-bottom: 0.3rem;
|
padding-bottom: 0.3rem;
|
||||||
}
|
}
|
||||||
.table td {
|
.table td {
|
||||||
padding: .40rem .75rem;
|
padding: 2px 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
.bg-application {
|
.bg-application {
|
||||||
@ -75,4 +75,4 @@
|
|||||||
|
|
||||||
.hl {
|
.hl {
|
||||||
background: #fff217;
|
background: #fff217;
|
||||||
}
|
}
|
||||||
|
@ -18,17 +18,12 @@
|
|||||||
<div class="form-group col-md-2">
|
<div class="form-group col-md-2">
|
||||||
<select class="form-control" name="sort_order" title="Sort order">
|
<select class="form-control" name="sort_order" title="Sort order">
|
||||||
<option disabled>Select sort order</option>
|
<option disabled>Select sort order</option>
|
||||||
<option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance
|
<option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance </option>
|
||||||
</option>
|
<option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size ascending </option>
|
||||||
<option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size
|
<option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size descending</option>
|
||||||
ascending
|
<option value="date_asc" {{ "selected" if sort_order == "date_asc" else "" }}>Date ascending</option>
|
||||||
</option>
|
<option value="date_dsc" {{ "selected" if sort_order == "date_dsc" else "" }}>Date descending</option>
|
||||||
<option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size
|
<option value="none" {{ "selected" if sort_order == "none" else "" }}>No order (faster)</option>
|
||||||
descending
|
|
||||||
</option>
|
|
||||||
<option value="none" {{ "selected" if sort_order == "none" else "" }}>No order
|
|
||||||
(faster)
|
|
||||||
</option>
|
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group col-md-2">
|
<div class="form-group col-md-2">
|
||||||
@ -47,7 +42,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{% if results["hits"]["total"] > 0 %}
|
{% if results and results["hits"]["total"] > 0 %}
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
|
|
||||||
@ -60,13 +55,11 @@
|
|||||||
{% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %}
|
{% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %}
|
||||||
{% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
|
{% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
|
||||||
|
|
||||||
{# TODO: website url + path #}
|
|
||||||
{% set path = src["path"] %}
|
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
{# File name & link #}
|
{# File name & link #}
|
||||||
<a href="{{ path + "/" + src["name"] }}" title="{{ src["name"] }}">
|
<a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
|
||||||
{{ hl_name |safe }}
|
{{ hl_name |safe }}{{ src["ext"] }}
|
||||||
</a>
|
</a>
|
||||||
{# File type badge #}
|
{# File type badge #}
|
||||||
{% set mime = get_mime(src["path"]) %}
|
{% set mime = get_mime(src["path"]) %}
|
||||||
@ -78,13 +71,13 @@
|
|||||||
{# File path #}
|
{# File path #}
|
||||||
<div class="text-muted" title="{{ path }}" style="font-size: 10px;">
|
<div class="text-muted" title="{{ path }}" style="font-size: 10px;">
|
||||||
<a style="color: #6c757d" title="See files from this website"
|
<a style="color: #6c757d" title="See files from this website"
|
||||||
{# todo: website url #}
|
href="/website/{{ src["website_id"] }}">{{ src["website_url"] }}</a>{{ hl_path|safe }}
|
||||||
href="/website/{{ src["website_id"] }}">{{ hl_path | safe }}</a>{{ truncate_path(src["path"], 60) }}
|
|
||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
{# File size #}
|
{# File size & date #}
|
||||||
<td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
|
<td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
|
||||||
{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}
|
<div>{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}</div>
|
||||||
|
<code>{{ src["mtime"] | datetime_format }}</code>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user