mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 19:56:51 +00:00 
			
		
		
		
	Added website url & date in search results & fixed threading problem
This commit is contained in:
		
							parent
							
								
									0127b3a51d
								
							
						
					
					
						commit
						4b60ac62fc
					
				
							
								
								
									
										10
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								app.py
									
									
									
									
									
								
							| @ -26,7 +26,7 @@ searchEngine = ElasticSearchEngine("od-database") | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @app.template_filter("datetime_format") | @app.template_filter("datetime_format") | ||||||
| def datetime_format(value, format='%Y-%m-%d %H:%M UTC'): | def datetime_format(value, format='%Y-%m-%d'): | ||||||
|     return time.strftime(format, time.gmtime(value)) |     return time.strftime(format, time.gmtime(value)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -107,18 +107,16 @@ def search(): | |||||||
|     per_page = int(per_page) if per_page.isdigit() else "50" |     per_page = int(per_page) if per_page.isdigit() else "50" | ||||||
|     per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 |     per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50 | ||||||
| 
 | 
 | ||||||
|     if q: |     if len(q) >= 3: | ||||||
|         try: |         try: | ||||||
|             # hits = sea.search(q, per_page, page, sort_order) |             hits = searchEngine.search(q, page, per_page, sort_order) | ||||||
|             hits = searchEngine.search(q, page, per_page) |             hits = db.join_search_result(hits) | ||||||
|         except InvalidQueryException as e: |         except InvalidQueryException as e: | ||||||
|             flash("<strong>Invalid query:</strong> " + str(e), "warning") |             flash("<strong>Invalid query:</strong> " + str(e), "warning") | ||||||
|             return redirect("/search") |             return redirect("/search") | ||||||
|     else: |     else: | ||||||
|         hits = None |         hits = None | ||||||
| 
 | 
 | ||||||
|     print(hits) |  | ||||||
| 
 |  | ||||||
|     return render_template("search.html", |     return render_template("search.html", | ||||||
|                            results=hits, q=q, p=page, sort_order=sort_order, |                            results=hits, q=q, p=page, sort_order=sort_order, | ||||||
|                            per_page=per_page, results_set=config.RESULTS_PER_PAGE) |                            per_page=per_page, results_set=config.RESULTS_PER_PAGE) | ||||||
|  | |||||||
| @ -1,5 +1,5 @@ | |||||||
| from crawl_server.database import TaskManagerDatabase, Task, TaskResult | from crawl_server.database import TaskManagerDatabase, Task, TaskResult | ||||||
| from multiprocessing import Pool | from concurrent.futures import ProcessPoolExecutor | ||||||
| from apscheduler.schedulers.background import BackgroundScheduler | from apscheduler.schedulers.background import BackgroundScheduler | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from crawl_server.crawler import RemoteDirectoryCrawler | from crawl_server.crawler import RemoteDirectoryCrawler | ||||||
| @ -10,7 +10,7 @@ class TaskManager: | |||||||
|     def __init__(self, db_path, max_processes=8): |     def __init__(self, db_path, max_processes=8): | ||||||
|         self.db_path = db_path |         self.db_path = db_path | ||||||
|         self.db = TaskManagerDatabase(db_path) |         self.db = TaskManagerDatabase(db_path) | ||||||
|         self.pool = Pool(processes=max_processes) |         self.pool = ProcessPoolExecutor(max_workers=max_processes) | ||||||
| 
 | 
 | ||||||
|         self.current_tasks = [] |         self.current_tasks = [] | ||||||
| 
 | 
 | ||||||
| @ -39,12 +39,10 @@ class TaskManager: | |||||||
| 
 | 
 | ||||||
|             print("pooled " + task.url) |             print("pooled " + task.url) | ||||||
| 
 | 
 | ||||||
|             self.pool.apply_async( |             self.pool.submit( | ||||||
|                 TaskManager.run_task, |                 TaskManager.run_task, | ||||||
|                 args=(task, self.db_path), |                 task, self.db_path | ||||||
|                 callback=TaskManager.task_complete, |             ).add_done_callback(TaskManager.task_complete) | ||||||
|                 error_callback=TaskManager.task_error |  | ||||||
|             ) |  | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def run_task(task, db_path): |     def run_task(task, db_path): | ||||||
| @ -63,19 +61,20 @@ class TaskManager: | |||||||
|         result.end_time = datetime.utcnow() |         result.end_time = datetime.utcnow() | ||||||
|         print("End task " + task.url) |         print("End task " + task.url) | ||||||
| 
 | 
 | ||||||
|         return dict(result=result, db_path=db_path) |         return result, db_path | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def task_complete(kwargs): |     def task_complete(result): | ||||||
|         result = kwargs["result"] | 
 | ||||||
|         db_path = kwargs["db_path"] |         task_result, db_path = result.result() | ||||||
|         print(result.status_code) | 
 | ||||||
|         print(result.file_count) |         print(task_result.status_code) | ||||||
|         print(result.start_time) |         print(task_result.file_count) | ||||||
|         print(result.end_time) |         print(task_result.start_time) | ||||||
|  |         print(task_result.end_time) | ||||||
| 
 | 
 | ||||||
|         db = TaskManagerDatabase(db_path) |         db = TaskManagerDatabase(db_path) | ||||||
|         db.log_result(result) |         db.log_result(task_result) | ||||||
|         print("Logged result to DB") |         print("Logged result to DB") | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | |||||||
							
								
								
									
										25
									
								
								database.py
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								database.py
									
									
									
									
									
								
							| @ -191,6 +191,31 @@ class Database: | |||||||
|             cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, )) |             cursor.execute("DELETE FROM ApiToken WHERE token=?", (token, )) | ||||||
|             conn.commit() |             conn.commit() | ||||||
| 
 | 
 | ||||||
|  |     def _get_websites(self) -> dict: | ||||||
|  | 
 | ||||||
|  |         # todo: mem cache that | ||||||
|  |         with sqlite3.connect(self.db_path) as conn: | ||||||
|  | 
 | ||||||
|  |             cursor = conn.cursor() | ||||||
|  | 
 | ||||||
|  |             cursor.execute("SELECT id, url FROM Website") | ||||||
|  | 
 | ||||||
|  |             result = {} | ||||||
|  | 
 | ||||||
|  |             for db_website in cursor.fetchall(): | ||||||
|  |                 result[db_website[0]] = db_website[1] | ||||||
|  |             return result | ||||||
|  | 
 | ||||||
|  |     def join_search_result(self, page: dict) -> dict: | ||||||
|  | 
 | ||||||
|  |         websites = self._get_websites() | ||||||
|  | 
 | ||||||
|  |         for hit in page["hits"]["hits"]: | ||||||
|  |             hit["_source"]["website_url"] = websites[hit["_source"]["website_id"]] | ||||||
|  | 
 | ||||||
|  |         return page | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,4 +1,6 @@ | |||||||
| import elasticsearch | import elasticsearch | ||||||
|  | import os | ||||||
|  | import json | ||||||
| from elasticsearch.exceptions import TransportError | from elasticsearch.exceptions import TransportError | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -14,10 +16,7 @@ class SearchEngine: | |||||||
|     def import_json(self, in_str: str, website_id: int): |     def import_json(self, in_str: str, website_id: int): | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def search(self, query) -> {}: |     def search(self, query, page, per_page, sort_order) -> {}: | ||||||
|         raise NotImplementedError |  | ||||||
| 
 |  | ||||||
|     def scroll(self, scroll_id) -> {}: |  | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def reset(self): |     def reset(self): | ||||||
| @ -29,6 +28,15 @@ class SearchEngine: | |||||||
| 
 | 
 | ||||||
| class ElasticSearchEngine(SearchEngine): | class ElasticSearchEngine(SearchEngine): | ||||||
| 
 | 
 | ||||||
|  |     SORT_ORDERS = { | ||||||
|  |         "score": ["_score"], | ||||||
|  |         "size_asc": [{"size": {"order": "asc"}}], | ||||||
|  |         "size_dsc": [{"size": {"order": "desc"}}], | ||||||
|  |         "date_asc": [{"mtime": {"order": "asc"}}], | ||||||
|  |         "date_desc": [{"mtime": {"order": "desc"}}], | ||||||
|  |         "none": [] | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     def __init__(self, index_name): |     def __init__(self, index_name): | ||||||
|         super().__init__() |         super().__init__() | ||||||
|         self.index_name = index_name |         self.index_name = index_name | ||||||
| @ -68,7 +76,8 @@ class ElasticSearchEngine(SearchEngine): | |||||||
|             "name": {"analyzer": "my_nGram", "type": "text"}, |             "name": {"analyzer": "my_nGram", "type": "text"}, | ||||||
|             "mtime": {"type": "date", "format": "epoch_millis"}, |             "mtime": {"type": "date", "format": "epoch_millis"}, | ||||||
|             "size": {"type": "long"}, |             "size": {"type": "long"}, | ||||||
|             "website_id": {"type": "integer"} |             "website_id": {"type": "integer"}, | ||||||
|  |             "ext": {"type": "keyword"} | ||||||
|         }}, doc_type="file", index=self.index_name) |         }}, doc_type="file", index=self.index_name) | ||||||
| 
 | 
 | ||||||
|         self.es.indices.open(index=self.index_name) |         self.es.indices.open(index=self.index_name) | ||||||
| @ -85,16 +94,21 @@ class ElasticSearchEngine(SearchEngine): | |||||||
|         docs = [] |         docs = [] | ||||||
| 
 | 
 | ||||||
|         for line in in_str.splitlines(): |         for line in in_str.splitlines(): | ||||||
|             docs.append(line) |             doc = json.loads(line) | ||||||
|  |             name, ext = os.path.splitext(doc["name"]) | ||||||
|  |             doc["ext"] = ext if ext else "" | ||||||
|  |             doc["name"] = name | ||||||
|  |             doc["website_id"] = website_id | ||||||
|  |             docs.append(doc) | ||||||
| 
 | 
 | ||||||
|             if len(docs) >= import_every: |             if len(docs) >= import_every: | ||||||
|                 self._index(docs, website_id) |                 self._index(docs) | ||||||
|                 docs.clear() |                 docs.clear() | ||||||
|         self._index(docs, website_id) |         self._index(docs) | ||||||
| 
 | 
 | ||||||
|     def _index(self, docs, website_id): |     def _index(self, docs): | ||||||
|         print("Indexing " + str(len(docs)) + " docs") |         print("Indexing " + str(len(docs)) + " docs") | ||||||
|         bulk_string = ElasticSearchEngine.create_bulk_index_string(docs, website_id) |         bulk_string = ElasticSearchEngine.create_bulk_index_string(docs) | ||||||
|         result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file") |         result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file") | ||||||
| 
 | 
 | ||||||
|         if result["errors"]: |         if result["errors"]: | ||||||
| @ -102,17 +116,15 @@ class ElasticSearchEngine(SearchEngine): | |||||||
|             raise IndexingError |             raise IndexingError | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def create_bulk_index_string(docs: list, website_id: int): |     def create_bulk_index_string(docs: list): | ||||||
| 
 | 
 | ||||||
|         action_string = '{"index":{}}\n' |         action_string = '{"index":{}}\n' | ||||||
|         website_id_string = ',"website_id":' + str(website_id) + '}\n'  # Add website_id param to each doc |         return "\n".join("".join([action_string, json.dumps(doc)]) for doc in docs) | ||||||
| 
 | 
 | ||||||
|         return "\n".join("".join([action_string, doc[:-1], website_id_string]) for doc in docs) |     def search(self, query, page, per_page, sort_order) -> {}: | ||||||
| 
 |  | ||||||
|     def search(self, query, page, per_page) -> {}: |  | ||||||
| 
 | 
 | ||||||
|         filters = [] |         filters = [] | ||||||
|         sort_by = ["_score"] |         sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, []) | ||||||
| 
 | 
 | ||||||
|         page = self.es.search(body={ |         page = self.es.search(body={ | ||||||
|             "query": { |             "query": { | ||||||
|  | |||||||
| @ -11,7 +11,7 @@ | |||||||
|     padding-bottom: 0.3rem; |     padding-bottom: 0.3rem; | ||||||
| } | } | ||||||
| .table td { | .table td { | ||||||
|     padding: .40rem .75rem; |     padding: 2px 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| .bg-application { | .bg-application { | ||||||
| @ -75,4 +75,4 @@ | |||||||
| 
 | 
 | ||||||
| .hl { | .hl { | ||||||
|     background: #fff217; |     background: #fff217; | ||||||
| } | } | ||||||
|  | |||||||
| @ -18,17 +18,12 @@ | |||||||
|                         <div class="form-group col-md-2"> |                         <div class="form-group col-md-2"> | ||||||
|                             <select class="form-control" name="sort_order" title="Sort order"> |                             <select class="form-control" name="sort_order" title="Sort order"> | ||||||
|                                 <option disabled>Select sort order</option> |                                 <option disabled>Select sort order</option> | ||||||
|                                 <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance |                                 <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance </option> | ||||||
|                                 </option> |                                 <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size ascending </option> | ||||||
|                                 <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size |                                 <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size descending</option> | ||||||
|                                     ascending |                                 <option value="date_asc" {{ "selected" if sort_order == "date_asc" else "" }}>Date ascending</option> | ||||||
|                                 </option> |                                 <option value="date_dsc" {{ "selected" if sort_order == "date_dsc" else "" }}>Date descending</option> | ||||||
|                                 <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size |                                 <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order (faster)</option> | ||||||
|                                     descending |  | ||||||
|                                 </option> |  | ||||||
|                                 <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order |  | ||||||
|                                     (faster) |  | ||||||
|                                 </option> |  | ||||||
|                             </select> |                             </select> | ||||||
|                         </div> |                         </div> | ||||||
|                         <div class="form-group col-md-2"> |                         <div class="form-group col-md-2"> | ||||||
| @ -47,7 +42,7 @@ | |||||||
|             </div> |             </div> | ||||||
|         </div> |         </div> | ||||||
| 
 | 
 | ||||||
|         {% if results["hits"]["total"] > 0 %} |         {% if results and results["hits"]["total"] > 0 %} | ||||||
|             <div class="card"> |             <div class="card"> | ||||||
|                 <div class="card-body"> |                 <div class="card-body"> | ||||||
| 
 | 
 | ||||||
| @ -60,13 +55,11 @@ | |||||||
|                                 {% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %} |                                 {% set hl_name = hit["highlight"]["name"][0] if "name" in hit["highlight"] else src["name"] %} | ||||||
|                                 {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %} |                                 {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %} | ||||||
| 
 | 
 | ||||||
|                                 {# TODO: website url + path #} |  | ||||||
|                                 {% set path = src["path"] %} |  | ||||||
|                                 <tr> |                                 <tr> | ||||||
|                                     <td> |                                     <td> | ||||||
|                                         {# File name & link #} |                                         {# File name & link #} | ||||||
|                                         <a href="{{ path + "/" + src["name"] }}" title="{{ src["name"] }}"> |                                         <a href="{{ src["website_url"] + src["path"] + "/" + src["name"] + src["ext"] }}" title="{{ src["name"] + src["ext"] }}"> | ||||||
|                                             {{ hl_name |safe }} |                                             {{ hl_name |safe }}{{ src["ext"] }} | ||||||
|                                         </a> |                                         </a> | ||||||
|                                         {# File type badge #} |                                         {# File type badge #} | ||||||
|                                         {% set mime = get_mime(src["path"]) %} |                                         {% set mime = get_mime(src["path"]) %} | ||||||
| @ -78,13 +71,13 @@ | |||||||
|                                         {# File path #} |                                         {# File path #} | ||||||
|                                         <div class="text-muted" title="{{ path }}" style="font-size: 10px;"> |                                         <div class="text-muted" title="{{ path }}" style="font-size: 10px;"> | ||||||
|                                             <a style="color: #6c757d" title="See files from this website" |                                             <a style="color: #6c757d" title="See files from this website" | ||||||
|                                                     {# todo: website url #} |                                                href="/website/{{ src["website_id"] }}">{{ src["website_url"] }}</a>{{ hl_path|safe }} | ||||||
|                                                href="/website/{{ src["website_id"] }}">{{ hl_path | safe }}</a>{{ truncate_path(src["path"], 60) }} |  | ||||||
|                                         </div> |                                         </div> | ||||||
|                                     </td> |                                     </td> | ||||||
|                                     {# File size #} |                                     {# File size & date #} | ||||||
|                                     <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px"> |                                     <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px"> | ||||||
|                                         {{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }} |                                         <div>{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}</div> | ||||||
|  |                                         <code>{{ src["mtime"] | datetime_format }}</code> | ||||||
|                                     </td> |                                     </td> | ||||||
|                                 </tr> |                                 </tr> | ||||||
|                             {% endfor %} |                             {% endfor %} | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user