mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 19:56:51 +00:00 
			
		
		
		
	Search filter
This commit is contained in:
		
							parent
							
								
									a461b22ffc
								
							
						
					
					
						commit
						6e80791264
					
				
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @ -0,0 +1,3 @@ | ||||
| [submodule "fold_to_ascii"] | ||||
| 	path = fold_to_ascii | ||||
| 	url = https://github.com/spanishdict/fold_to_ascii | ||||
							
								
								
									
										20
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								app.py
									
									
									
									
									
								
							| @ -5,13 +5,13 @@ from urllib.parse import urlparse | ||||
| import os | ||||
| import time | ||||
| import datetime | ||||
| from database import Database, Website, InvalidQueryException | ||||
| from database import Database, Website | ||||
| from flask_recaptcha import ReCaptcha | ||||
| import od_util | ||||
| import config | ||||
| from flask_caching import Cache | ||||
| from tasks import TaskManager, Task, TaskResult | ||||
| from search.search import ElasticSearchEngine | ||||
| from search.search import ElasticSearchEngine, InvalidQueryException | ||||
| from callbacks import PostCrawlCallbackFactory | ||||
| 
 | ||||
| app = Flask(__name__) | ||||
| @ -287,11 +287,10 @@ def search(): | ||||
| 
 | ||||
|         if len(q) >= 3: | ||||
| 
 | ||||
|             blocked = False | ||||
|             hits = None | ||||
|             response = request.args.get("g-recaptcha-response", "") | ||||
|             if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response): | ||||
|                 db.log_search(request.remote_addr, | ||||
|                               request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, | ||||
|                               q, extensions, page) | ||||
| 
 | ||||
|                 try: | ||||
|                     hits = searchEngine.search(q, page, per_page, sort_order, | ||||
| @ -299,14 +298,19 @@ def search(): | ||||
|                     hits = db.join_website_on_search_result(hits) | ||||
|                 except InvalidQueryException as e: | ||||
|                     flash("<strong>Invalid query:</strong> " + str(e), "warning") | ||||
|                     return redirect("/search") | ||||
|                     blocked = True | ||||
|                 except Exception: | ||||
|                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. " | ||||
|                           "Please try again later", "danger") | ||||
|                     hits = None | ||||
| 
 | ||||
|                 db.log_search(request.remote_addr, | ||||
|                               request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None, | ||||
|                               q, extensions, page, blocked, | ||||
|                               hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1) | ||||
|                 if blocked: | ||||
|                     return redirect("/search") | ||||
|             else: | ||||
|                 flash("<strong>Error:</strong> Invalid captcha please try again", "danger") | ||||
|                 hits = None | ||||
| 
 | ||||
|         else: | ||||
|             hits = None | ||||
|  | ||||
| @ -7,8 +7,6 @@ import bcrypt | ||||
| import uuid | ||||
| import tasks | ||||
| 
 | ||||
| class InvalidQueryException(Exception): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class BlacklistedWebsite: | ||||
| @ -351,13 +349,13 @@ class Database: | ||||
| 
 | ||||
|         return stats | ||||
| 
 | ||||
|     def log_search(self, remote_addr, forwarded_for, q, exts, page): | ||||
|     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took): | ||||
| 
 | ||||
|         with sqlite3.connect(self.db_path) as conn: | ||||
|             cursor = conn.cursor() | ||||
| 
 | ||||
|             cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES " | ||||
|                            "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page)) | ||||
|             cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) " | ||||
|                            "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took)) | ||||
| 
 | ||||
|             conn.commit() | ||||
| 
 | ||||
|  | ||||
| @ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry ( | ||||
|   forwarded_for TEXT, | ||||
|   query TEXT, | ||||
|   extensions TEXT, | ||||
|   page INT | ||||
|   page INT, | ||||
|   blocked INT DEFAULT 0, | ||||
|   results INT DEFAULT 0, | ||||
|   took INT DEFAULT 0 | ||||
| ); | ||||
| 
 | ||||
| CREATE TABLE Queue ( | ||||
|  | ||||
							
								
								
									
										28
									
								
								search/filter.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								search/filter.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,28 @@ | ||||
| import os | ||||
| 
 | ||||
| from fold_to_ascii.fold_to_ascii import mapping | ||||
| 
 | ||||
| 
 | ||||
| class SearchFilter: | ||||
| 
 | ||||
|     def __init__(self): | ||||
| 
 | ||||
|         self.blacklisted_terms = set() | ||||
|         self.table = str.maketrans(dict(mapping.translate_table)) | ||||
| 
 | ||||
|         if os.path.exists("search_blacklist.txt"): | ||||
|             with open("search_blacklist.txt") as f: | ||||
|                 self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip()) | ||||
| 
 | ||||
|     def should_block(self, query) -> bool: | ||||
| 
 | ||||
|         query = query.translate(self.table) | ||||
|         query = query.lower() | ||||
| 
 | ||||
|         for raw_token in query.split(): | ||||
| 
 | ||||
|             token = raw_token.strip("\"'/\\").strip() | ||||
|             if token in self.blacklisted_terms: | ||||
|                 return True | ||||
| 
 | ||||
|         return False | ||||
| @ -5,6 +5,12 @@ import os | ||||
| import ujson | ||||
| from apscheduler.schedulers.background import BackgroundScheduler | ||||
| 
 | ||||
| from search.filter import SearchFilter | ||||
| 
 | ||||
| 
 | ||||
| class InvalidQueryException(Exception): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class IndexingError(Exception): | ||||
|     pass | ||||
| @ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine): | ||||
|         super().__init__() | ||||
|         self.index_name = index_name | ||||
|         self.es = elasticsearch.Elasticsearch() | ||||
|         self.filter = SearchFilter() | ||||
| 
 | ||||
|         if not self.es.indices.exists(self.index_name): | ||||
|             self.init() | ||||
| @ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine): | ||||
|     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, | ||||
|                date_max) -> {}: | ||||
| 
 | ||||
|         if self.filter.should_block(query): | ||||
|             raise InvalidQueryException("One or more terms in your query is blocked by the search filter. " | ||||
|                                         "This incident has been reported.") | ||||
| 
 | ||||
|         filters = [] | ||||
|         if extensions: | ||||
|             filters.append({"terms": {"ext": extensions}}) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user