mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-26 03:56:52 +00:00 
			
		
		
		
	Search filter
This commit is contained in:
		
							parent
							
								
									a461b22ffc
								
							
						
					
					
						commit
						6e80791264
					
				
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,3 @@ | |||||||
|  | [submodule "fold_to_ascii"] | ||||||
|  | 	path = fold_to_ascii | ||||||
|  | 	url = https://github.com/spanishdict/fold_to_ascii | ||||||
							
								
								
									
										20
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								app.py
									
									
									
									
									
								
							| @ -5,13 +5,13 @@ from urllib.parse import urlparse | |||||||
| import os | import os | ||||||
| import time | import time | ||||||
| import datetime | import datetime | ||||||
| from database import Database, Website, InvalidQueryException | from database import Database, Website | ||||||
| from flask_recaptcha import ReCaptcha | from flask_recaptcha import ReCaptcha | ||||||
| import od_util | import od_util | ||||||
| import config | import config | ||||||
| from flask_caching import Cache | from flask_caching import Cache | ||||||
| from tasks import TaskManager, Task, TaskResult | from tasks import TaskManager, Task, TaskResult | ||||||
| from search.search import ElasticSearchEngine | from search.search import ElasticSearchEngine, InvalidQueryException | ||||||
| from callbacks import PostCrawlCallbackFactory | from callbacks import PostCrawlCallbackFactory | ||||||
| 
 | 
 | ||||||
| app = Flask(__name__) | app = Flask(__name__) | ||||||
| @ -287,11 +287,10 @@ def search(): | |||||||
| 
 | 
 | ||||||
|         if len(q) >= 3: |         if len(q) >= 3: | ||||||
| 
 | 
 | ||||||
|  |             blocked = False | ||||||
|  |             hits = None | ||||||
|             response = request.args.get("g-recaptcha-response", "") |             response = request.args.get("g-recaptcha-response", "") | ||||||
|             if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): |             if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): | ||||||
|                 db.log_search(request.remote_addr, |  | ||||||
|                               request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, |  | ||||||
|                               q, extensions, page) |  | ||||||
| 
 | 
 | ||||||
|                 try: |                 try: | ||||||
|                     hits = searchEngine.search(q, page, per_page, sort_order, |                     hits = searchEngine.search(q, page, per_page, sort_order, | ||||||
| @ -299,14 +298,19 @@ def search(): | |||||||
|                     hits = db.join_website_on_search_result(hits) |                     hits = db.join_website_on_search_result(hits) | ||||||
|                 except InvalidQueryException as e: |                 except InvalidQueryException as e: | ||||||
|                     flash("<strong>Invalid query:</strong> " + str(e), "warning") |                     flash("<strong>Invalid query:</strong> " + str(e), "warning") | ||||||
|                     return redirect("/search") |                     blocked = True | ||||||
|                 except Exception: |                 except Exception: | ||||||
|                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. " |                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. " | ||||||
|                           "Please try again later", "danger") |                           "Please try again later", "danger") | ||||||
|                     hits = None | 
 | ||||||
|  |                 db.log_search(request.remote_addr, | ||||||
|  |                               request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, | ||||||
|  |                               q, extensions, page, blocked, | ||||||
|  |                               hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1) | ||||||
|  |                 if blocked: | ||||||
|  |                     return redirect("/search") | ||||||
|             else: |             else: | ||||||
|                 flash("<strong>Error:</strong> Invalid captcha please try again", "danger") |                 flash("<strong>Error:</strong> Invalid captcha please try again", "danger") | ||||||
|                 hits = None |  | ||||||
| 
 | 
 | ||||||
|         else: |         else: | ||||||
|             hits = None |             hits = None | ||||||
|  | |||||||
| @ -7,8 +7,6 @@ import bcrypt | |||||||
| import uuid | import uuid | ||||||
| import tasks | import tasks | ||||||
| 
 | 
 | ||||||
| class InvalidQueryException(Exception): |  | ||||||
|     pass |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BlacklistedWebsite: | class BlacklistedWebsite: | ||||||
| @ -351,13 +349,13 @@ class Database: | |||||||
| 
 | 
 | ||||||
|         return stats |         return stats | ||||||
| 
 | 
 | ||||||
|     def log_search(self, remote_addr, forwarded_for, q, exts, page): |     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took): | ||||||
| 
 | 
 | ||||||
|         with sqlite3.connect(self.db_path) as conn: |         with sqlite3.connect(self.db_path) as conn: | ||||||
|             cursor = conn.cursor() |             cursor = conn.cursor() | ||||||
| 
 | 
 | ||||||
|             cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES " |             cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) " | ||||||
|                            "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page)) |                            "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took)) | ||||||
| 
 | 
 | ||||||
|             conn.commit() |             conn.commit() | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry ( | |||||||
|   forwarded_for TEXT, |   forwarded_for TEXT, | ||||||
|   query TEXT, |   query TEXT, | ||||||
|   extensions TEXT, |   extensions TEXT, | ||||||
|   page INT |   page INT, | ||||||
|  |   blocked INT DEFAULT 0, | ||||||
|  |   results INT DEFAULT 0, | ||||||
|  |   took INT DEFAULT 0 | ||||||
| ); | ); | ||||||
| 
 | 
 | ||||||
| CREATE TABLE Queue ( | CREATE TABLE Queue ( | ||||||
|  | |||||||
							
								
								
									
										28
									
								
								search/filter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								search/filter.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,28 @@ | |||||||
|  | import os | ||||||
|  | 
 | ||||||
|  | from fold_to_ascii.fold_to_ascii import mapping | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class SearchFilter:
    """Reject search queries that contain a blacklisted term.

    Terms are loaded once, at construction time, from a plain-text file with
    one term per line; blank lines and lines whose first non-blank character
    is ``#`` are ignored.

    Blacklist entries and incoming queries go through the same normalisation
    (character folding via ``self.table`` followed by lower-casing), so an
    entry written with uppercase or accented characters still matches.
    """

    def __init__(self, blacklist_path="search_blacklist.txt"):
        """Build the folding table and load the blacklist.

        :param blacklist_path: path of the blacklist file. A missing file
            simply yields an empty blacklist.
        """
        # The table must exist before the terms are loaded, because loading
        # normalises every entry with it.
        self.table = str.maketrans(dict(mapping.translate_table))
        self.blacklisted_terms = self._load_terms(blacklist_path)

    def _normalize(self, text):
        """Return *text* folded through ``self.table`` and lower-cased."""
        return text.translate(self.table).lower()

    def _load_terms(self, path):
        """Read *path* and return the set of normalised blacklist terms."""
        terms = set()
        if not os.path.exists(path):
            return terms

        with open(path) as f:
            for line in f:
                entry = line.strip()
                # Skip blank lines and comments (including indented ones).
                if not entry or entry.startswith("#"):
                    continue
                term = self._normalize(entry)
                # An entry that folds to nothing must not be stored: an empty
                # term would match any token made only of quotes/slashes.
                if term:
                    terms.add(term)
        return terms

    def should_block(self, query) -> bool:
        """Tell whether any whitespace-separated token of *query* is blacklisted.

        Surrounding quotes and slashes are removed from each token before it
        is compared with the blacklist.

        :param query: raw query string
        :return: ``True`` if at least one token is blacklisted
        """
        tokens = self._normalize(query).split()
        return any(token.strip("\"'/\\") in self.blacklisted_terms for token in tokens)
| @ -5,6 +5,12 @@ import os | |||||||
| import ujson | import ujson | ||||||
| from apscheduler.schedulers.background import BackgroundScheduler | from apscheduler.schedulers.background import BackgroundScheduler | ||||||
| 
 | 
 | ||||||
|  | from search.filter import SearchFilter | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class InvalidQueryException(Exception): | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class IndexingError(Exception): | class IndexingError(Exception): | ||||||
|     pass |     pass | ||||||
| @ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine): | |||||||
|         super().__init__() |         super().__init__() | ||||||
|         self.index_name = index_name |         self.index_name = index_name | ||||||
|         self.es = elasticsearch.Elasticsearch() |         self.es = elasticsearch.Elasticsearch() | ||||||
|  |         self.filter = SearchFilter() | ||||||
| 
 | 
 | ||||||
|         if not self.es.indices.exists(self.index_name): |         if not self.es.indices.exists(self.index_name): | ||||||
|             self.init() |             self.init() | ||||||
| @ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine): | |||||||
|     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, |     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, | ||||||
|                date_max) -> {}: |                date_max) -> {}: | ||||||
| 
 | 
 | ||||||
|  |         if self.filter.should_block(query): | ||||||
|  |             raise InvalidQueryException("One or more terms in your query is blocked by the search filter. " | ||||||
|  |                                         "This incident has been reported.") | ||||||
|  | 
 | ||||||
|         filters = [] |         filters = [] | ||||||
|         if extensions: |         if extensions: | ||||||
|             filters.append({"terms": {"ext": extensions}}) |             filters.append({"terms": {"ext": extensions}}) | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user