mirror of
				https://github.com/simon987/od-database.git
				synced 2025-11-03 14:36:53 +00:00 
			
		
		
		
	Added website url & date in search results & fixed threading problem
This commit is contained in:
		
							parent
							
								
									0127b3a51d
								
							
						
					
					
						commit
						4b60ac62fc
					
				
							
								
								
									
										10
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								app.py
									
									
									
									
									
								
							@ -26,7 +26,7 @@ searchEngine = ElasticSearchEngine("od-database")
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.template_filter("datetime_format")
 | 
					@app.template_filter("datetime_format")
 | 
				
			||||||
def datetime_format(value, format='%Y-%m-%d %H:%M UTC'):
 | 
					def datetime_format(value, format='%Y-%m-%d'):
 | 
				
			||||||
    return time.strftime(format, time.gmtime(value))
 | 
					    return time.strftime(format, time.gmtime(value))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -107,18 +107,16 @@ def search():
 | 
				
			|||||||
    per_page = int(per_page) if per_page.isdigit() else "50"
 | 
					    per_page = int(per_page) if per_page.isdigit() else "50"
 | 
				
			||||||
    per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
 | 
					    per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if q:
 | 
					    if len(q) >= 3:
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            # hits = sea.search(q, per_page, page, sort_order)
 | 
					            hits = searchEngine.search(q, page, per_page, sort_order)
 | 
				
			||||||
            hits = searchEngine.search(q, page, per_page)
 | 
					            hits = db.join_search_result(hits)
 | 
				
			||||||
        except InvalidQueryException as e:
 | 
					        except InvalidQueryException as e:
 | 
				
			||||||
            flash("<strong>Invalid query:</strong> " + str(e), "warning")
 | 
					            flash("<strong>Invalid query:</strong> " + str(e), "warning")
 | 
				
			||||||
            return redirect("/search")
 | 
					            return redirect("/search")
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        hits = None
 | 
					        hits = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print(hits)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return render_template("search.html",
 | 
					    return render_template("search.html",
 | 
				
			||||||
                           results=hits, q=q, p=page, sort_order=sort_order,
 | 
					                           results=hits, q=q, p=page, sort_order=sort_order,
 | 
				
			||||||
                           per_page=per_page, results_set=config.RESULTS_PER_PAGE)
 | 
					                           per_page=per_page, results_set=config.RESULTS_PER_PAGE)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,5 +1,5 @@
 | 
				
			|||||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
 | 
					from crawl_server.database import TaskManagerDatabase, Task, TaskResult
 | 
				
			||||||
from multiprocessing import Pool
 | 
					from concurrent.futures import ProcessPoolExecutor
 | 
				
			||||||
from apscheduler.schedulers.background import BackgroundScheduler
 | 
					from apscheduler.schedulers.background import BackgroundScheduler
 | 
				
			||||||
from datetime import datetime
 | 
					from datetime import datetime
 | 
				
			||||||
from crawl_server.crawler import RemoteDirectoryCrawler
 | 
					from crawl_server.crawler import RemoteDirectoryCrawler
 | 
				
			||||||
@ -10,7 +10,7 @@ class TaskManager:
 | 
				
			|||||||
    def __init__(self, db_path, max_processes=8):
 | 
					    def __init__(self, db_path, max_processes=8):
 | 
				
			||||||
        self.db_path = db_path
 | 
					        self.db_path = db_path
 | 
				
			||||||
        self.db = TaskManagerDatabase(db_path)
 | 
					        self.db = TaskManagerDatabase(db_path)
 | 
				
			||||||
        self.pool = Pool(processes=max_processes)
 | 
					        self.pool = ProcessPoolExecutor(max_workers=max_processes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.current_tasks = []
 | 
					        self.current_tasks = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -39,12 +39,10 @@ class TaskManager:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            print("pooled " + task.url)
 | 
					            print("pooled " + task.url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.pool.apply_async(
 | 
					            self.pool.submit(
 | 
				
			||||||
                TaskManager.run_task,
 | 
					                TaskManager.run_task,
 | 
				
			||||||
                args=(task, self.db_path),
 | 
					                task, self.db_path
 | 
				
			||||||
                callback=TaskManager.task_complete,
 | 
					            ).add_done_callback(TaskManager.task_complete)
 | 
				
			||||||
                error_callback=TaskManager.task_error
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def run_task(task, db_path):
 | 
					    def run_task(task, db_path):
 | 
				
			||||||
@ -63,19 +61,20 @@ class TaskManager:
 | 
				
			|||||||
        result.end_time = datetime.utcnow()
 | 
					        result.end_time = datetime.utcnow()
 | 
				
			||||||
        print("End task " + task.url)
 | 
					        print("End task " + task.url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return dict(result=result, db_path=db_path)
 | 
					        return result, db_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def task_complete(kwargs):
 | 
					    def task_complete(result):
 | 
				
			||||||
        result = kwargs["result"]
 | 
					
 | 
				
			||||||
        db_path = kwargs["db_path"]
 | 
					        task_result, db_path = result.result()
 | 
				
			||||||
        print(result.status_code)
 | 
					
 | 
				
			||||||
        print(result.file_count)
 | 
					        print(task_result.status_code)
 | 
				
			||||||
        print(result.start_time)
 | 
					        print(task_result.file_count)
 | 
				
			||||||
        print(result.end_time)
 | 
					        print(task_result.start_time)
 | 
				
			||||||
 | 
					        print(task_result.end_time)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        db = TaskManagerDatabase(db_path)
 | 
					        db = TaskManagerDatabase(db_path)
 | 
				
			||||||
        db.log_result(result)
 | 
					        db.log_result(task_result)
 | 
				
			||||||
        print("Logged result to DB")
 | 
					        print("Logged result to DB")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										25
									
								
								database.py
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								database.py
									
									
									
									
									
								
							@ -191,6 +191,31 @@ class Database:
 | 
				
			|||||||
            cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, ))
 | 
					            cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, ))
 | 
				
			||||||
            conn.commit()
 | 
					            conn.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _get_websites(self) -> dict:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # todo: mem cache that
 | 
				
			||||||
 | 
					        with sqlite3.connect(self.db_path) as conn:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            cursor = conn.cursor()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            cursor.execute("SELECT id, url FROM Website")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            result = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            for db_website in cursor.fetchall():
 | 
				
			||||||
 | 
					                result[db_website[0]] = db_website[1]
 | 
				
			||||||
 | 
					            return result
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def join_search_result(self, page: dict) -> dict:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        websites = self._get_websites()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for hit in page["hits"]["hits"]:
 | 
				
			||||||
 | 
					            hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return page
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1,4 +1,6 @@
 | 
				
			|||||||
import elasticsearch
 | 
					import elasticsearch
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
from elasticsearch.exceptions import TransportError
 | 
					from elasticsearch.exceptions import TransportError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -14,10 +16,7 @@ class SearchEngine:
 | 
				
			|||||||
    def import_json(self, in_str: str, website_id: int):
 | 
					    def import_json(self, in_str: str, website_id: int):
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def search(self, query) -> {}:
 | 
					    def search(self, query, page, per_page, sort_order) -> {}:
 | 
				
			||||||
        raise NotImplementedError
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def scroll(self, scroll_id) -> {}:
 | 
					 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def reset(self):
 | 
					    def reset(self):
 | 
				
			||||||
@ -29,6 +28,15 @@ class SearchEngine:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class ElasticSearchEngine(SearchEngine):
 | 
					class ElasticSearchEngine(SearchEngine):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    SORT_ORDERS = {
 | 
				
			||||||
 | 
					        "score": ["_score"],
 | 
				
			||||||
 | 
					        "size_asc": [{"size": {"order": "asc"}}],
 | 
				
			||||||
 | 
					        "size_dsc": [{"size": {"order": "desc"}}],
 | 
				
			||||||
 | 
					        "date_asc": [{"mtime": {"order": "asc"}}],
 | 
				
			||||||
 | 
					        "date_desc": [{"mtime": {"order": "desc"}}],
 | 
				
			||||||
 | 
					        "none": []
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, index_name):
 | 
					    def __init__(self, index_name):
 | 
				
			||||||
        super().__init__()
 | 
					        super().__init__()
 | 
				
			||||||
        self.index_name = index_name
 | 
					        self.index_name = index_name
 | 
				
			||||||
@ -68,7 +76,8 @@ class ElasticSearchEngine(SearchEngine):
 | 
				
			|||||||
            "name": {"analyzer": "my_nGram", "type": "text"},
 | 
					            "name": {"analyzer": "my_nGram", "type": "text"},
 | 
				
			||||||
            "mtime": {"type": "date", "format": "epoch_millis"},
 | 
					            "mtime": {"type": "date", "format": "epoch_millis"},
 | 
				
			||||||
            "size": {"type": "long"},
 | 
					            "size": {"type": "long"},
 | 
				
			||||||
            "website_id": {"type": "integer"}
 | 
					            "website_id": {"type": "integer"},
 | 
				
			||||||
 | 
					            "ext": {"type": "keyword"}
 | 
				
			||||||
        }}, doc_type="file", index=self.index_name)
 | 
					        }}, doc_type="file", index=self.index_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.es.indices.open(index=self.index_name)
 | 
					        self.es.indices.open(index=self.index_name)
 | 
				
			||||||
@ -85,16 +94,21 @@ class ElasticSearchEngine(SearchEngine):
 | 
				
			|||||||
        docs = []
 | 
					        docs = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for line in in_str.splitlines():
 | 
					        for line in in_str.splitlines():
 | 
				
			||||||
            docs.append(line)
 | 
					            doc = json.loads(line)
 | 
				
			||||||
 | 
					            name, ext = os.path.splitext(doc["name"])
 | 
				
			||||||
 | 
					            doc["ext"] = ext if ext else ""
 | 
				
			||||||
 | 
					            doc["name"] = name
 | 
				
			||||||
 | 
					            doc["website_id"] = website_id
 | 
				
			||||||
 | 
					            docs.append(doc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if len(docs) >= import_every:
 | 
					            if len(docs) >= import_every:
 | 
				
			||||||
                self._index(docs, website_id)
 | 
					                self._index(docs)
 | 
				
			||||||
                docs.clear()
 | 
					                docs.clear()
 | 
				
			||||||
        self._index(docs, website_id)
 | 
					        self._index(docs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _index(self, docs, website_id):
 | 
					    def _index(self, docs):
 | 
				
			||||||
        print("Indexing " + str(len(docs)) + " docs")
 | 
					        print("Indexing " + str(len(docs)) + " docs")
 | 
				
			||||||
        bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id)
 | 
					        bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
 | 
				
			||||||
        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")
 | 
					        result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if result["errors"]:
 | 
					        if result["errors"]:
 | 
				
			||||||
@ -102,17 +116,15 @@ class ElasticSearchEngine(SearchEngine):
 | 
				
			|||||||
            raise IndexingError
 | 
					            raise IndexingError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def create_bulk_index_string(docs: list, website_id: int):
 | 
					    def create_bulk_index_string(docs: list):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        action_string = '{"index":{}}\n'
 | 
					        action_string = '{"index":{}}\n'
 | 
				
			||||||
        website_id_string = ',"website_id":' + str(website_id) + '}\n'  # Add website_id param to each doc
 | 
					        return "\n".join("".join([action_string, json.dumps(doc)]) for doc in docs)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return "\n".join("".join([action_string, doc[:-1], website_id_string]) for doc in docs)
 | 
					    def search(self, query, page, per_page, sort_order) -> {}:
 | 
				
			||||||
 | 
					 | 
				
			||||||
    def search(self, query, page, per_page) -> {}:
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        filters = []
 | 
					        filters = []
 | 
				
			||||||
        sort_by = ["_score"]
 | 
					        sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        page = self.es.search(body={
 | 
					        page = self.es.search(body={
 | 
				
			||||||
            "query": {
 | 
					            "query": {
 | 
				
			||||||
 | 
				
			|||||||
@ -11,7 +11,7 @@
 | 
				
			|||||||
    padding-bottom: 0.3rem;
 | 
					    padding-bottom: 0.3rem;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
.table td {
 | 
					.table td {
 | 
				
			||||||
    padding: .40rem .75rem;
 | 
					    padding: 2px 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.bg-application {
 | 
					.bg-application {
 | 
				
			||||||
@ -75,4 +75,4 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
.hl {
 | 
					.hl {
 | 
				
			||||||
    background: #fff217;
 | 
					    background: #fff217;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
@ -18,17 +18,12 @@
 | 
				
			|||||||
                        <div class="form-group col-md-2">
 | 
					                        <div class="form-group col-md-2">
 | 
				
			||||||
                            <select class="form-control" name="sort_order" title="Sort order">
 | 
					                            <select class="form-control" name="sort_order" title="Sort order">
 | 
				
			||||||
                                <option disabled>Select sort order</option>
 | 
					                                <option disabled>Select sort order</option>
 | 
				
			||||||
                                <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance
 | 
					                                <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance </option>
 | 
				
			||||||
                                </option>
 | 
					                                <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size ascending </option>
 | 
				
			||||||
                                <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size
 | 
					                                <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size descending</option>
 | 
				
			||||||
                                    ascending
 | 
					                                <option value="date_asc" {{ "selected" if sort_order == "date_asc" else "" }}>Date ascending</option>
 | 
				
			||||||
                                </option>
 | 
					                                <option value="date_dsc" {{ "selected" if sort_order == "date_dsc" else "" }}>Date descending</option>
 | 
				
			||||||
                                <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size
 | 
					                                <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order (faster)</option>
 | 
				
			||||||
                                    descending
 | 
					 | 
				
			||||||
                                </option>
 | 
					 | 
				
			||||||
                                <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order
 | 
					 | 
				
			||||||
                                    (faster)
 | 
					 | 
				
			||||||
                                </option>
 | 
					 | 
				
			||||||
                            </select>
 | 
					                            </select>
 | 
				
			||||||
                        </div>
 | 
					                        </div>
 | 
				
			||||||
                        <div class="form-group col-md-2">
 | 
					                        <div class="form-group col-md-2">
 | 
				
			||||||
@ -47,7 +42,7 @@
 | 
				
			|||||||
            </div>
 | 
					            </div>
 | 
				
			||||||
        </div>
 | 
					        </div>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        {% if results["hits"]["total"] > 0 %}
 | 
					        {% if results and results["hits"]["total"] > 0 %}
 | 
				
			||||||
            <div class="card">
 | 
					            <div class="card">
 | 
				
			||||||
                <div class="card-body">
 | 
					                <div class="card-body">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -60,13 +55,11 @@
 | 
				
			|||||||
                                {% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %}
 | 
					                                {% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %}
 | 
				
			||||||
                                {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
 | 
					                                {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                                {# TODO: website url + path #}
 | 
					 | 
				
			||||||
                                {% set path = src["path"] %}
 | 
					 | 
				
			||||||
                                <tr>
 | 
					                                <tr>
 | 
				
			||||||
                                    <td>
 | 
					                                    <td>
 | 
				
			||||||
                                        {# File name & link #}
 | 
					                                        {# File name & link #}
 | 
				
			||||||
                                        <a href="{{ path + "/" + src["name"] }}" title="{{ src["name"] }}">
 | 
					                                        <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}">
 | 
				
			||||||
                                            {{ hl_name |safe }}
 | 
					                                            {{ hl_name |safe }}{{ src["ext"] }}
 | 
				
			||||||
                                        </a>
 | 
					                                        </a>
 | 
				
			||||||
                                        {# File type badge #}
 | 
					                                        {# File type badge #}
 | 
				
			||||||
                                        {% set mime = get_mime(src["path"]) %}
 | 
					                                        {% set mime = get_mime(src["path"]) %}
 | 
				
			||||||
@ -78,13 +71,13 @@
 | 
				
			|||||||
                                        {# File path #}
 | 
					                                        {# File path #}
 | 
				
			||||||
                                        <div class="text-muted" title="{{ path }}" style="font-size: 10px;">
 | 
					                                        <div class="text-muted" title="{{ path }}" style="font-size: 10px;">
 | 
				
			||||||
                                            <a style="color: #6c757d" title="See files from this website"
 | 
					                                            <a style="color: #6c757d" title="See files from this website"
 | 
				
			||||||
                                                    {# todo: website url #}
 | 
					                                               href="/website/{{ src["website_id"] }}">{{ src["website_url"] }}</a>{{ hl_path|safe }}
 | 
				
			||||||
                                               href="/website/{{ src["website_id"] }}">{{ hl_path | safe }}</a>{{ truncate_path(src["path"], 60) }}
 | 
					 | 
				
			||||||
                                        </div>
 | 
					                                        </div>
 | 
				
			||||||
                                    </td>
 | 
					                                    </td>
 | 
				
			||||||
                                    {# File size #}
 | 
					                                    {# File size & date #}
 | 
				
			||||||
                                    <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
 | 
					                                    <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
 | 
				
			||||||
                                        {{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}
 | 
					                                        <div>{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}</div>
 | 
				
			||||||
 | 
					                                        <code>{{ src["mtime"] | datetime_format }}</code>
 | 
				
			||||||
                                    </td>
 | 
					                                    </td>
 | 
				
			||||||
                                </tr>
 | 
					                                </tr>
 | 
				
			||||||
                            {% endfor %}
 | 
					                            {% endfor %}
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user