From 6e80791264c0078644e5b86d681ad1eb147e5abf Mon Sep 17 00:00:00 2001
From: Simon
Date: Fri, 16 Nov 2018 16:49:23 -0500
Subject: [PATCH] Search filter

---
Reviewer notes (this section, between the three-dash line and the
diffstat, is ignored by git am and is not part of the commit message):

* Line breaks and indentation were restored from a copy of this patch in
  which they had been collapsed. Blank context lines were placed so that
  every hunk matches its header counts; run `git apply --check` before
  applying.
* search/filter.py imports the fold_to_ascii submodule registered in
  .gitmodules, so the submodule has to be checked out before
  search/search.py can be imported.
* SearchFilter compares ASCII-folded, lower-cased, whitespace-split query
  tokens verbatim against the lines of search_blacklist.txt. Entries that
  contain upper-case characters or inner whitespace can never match.
* Only init_script.sql is updated for the three new SearchLogEntry
  columns; already-initialised databases presumably need a manual
  ALTER TABLE (to be confirmed).

 .gitmodules      |  3 +++
 app.py           | 20 ++++++++++++--------
 database.py      |  8 +++-----
 init_script.sql  |  5 ++++-
 search/filter.py | 28 ++++++++++++++++++++++++++++
 search/search.py | 11 +++++++++++
 6 files changed, 61 insertions(+), 14 deletions(-)
 create mode 100644 .gitmodules
 create mode 100644 search/filter.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..6be6199
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "fold_to_ascii"]
+	path = fold_to_ascii
+	url = https://github.com/spanishdict/fold_to_ascii
diff --git a/app.py b/app.py
index 7a93bf6..1278af5 100644
--- a/app.py
+++ b/app.py
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 import os
 import time
 import datetime
-from database import Database, Website, InvalidQueryException
+from database import Database, Website
 from flask_recaptcha import ReCaptcha
 import od_util
 import config
 from flask_caching import Cache
 from tasks import TaskManager, Task, TaskResult
-from search.search import ElasticSearchEngine
+from search.search import ElasticSearchEngine, InvalidQueryException
 from callbacks import PostCrawlCallbackFactory
 
 app = Flask(__name__)
@@ -287,11 +287,10 @@ def search():
 
     if len(q) >= 3:
 
+        blocked = False
+        hits = None
         response = request.args.get("g-recaptcha-response", "")
         if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
-            db.log_search(request.remote_addr,
-                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
-                          q, extensions, page)
 
             try:
                 hits = searchEngine.search(q, page, per_page, sort_order,
@@ -299,14 +298,19 @@ def search():
                 hits = db.join_website_on_search_result(hits)
             except InvalidQueryException as e:
                 flash("Invalid query: " + str(e), "warning")
-                return redirect("/search")
+                blocked = True
             except Exception:
                 flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
                       "Please try again later", "danger")
-                hits = None
+
+            db.log_search(request.remote_addr,
+                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
+                          q, extensions, page, blocked,
+                          hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
+            if blocked:
+                return redirect("/search")
         else:
             flash("Error: Invalid captcha please try again", "danger")
-            hits = None
 
     else:
         hits = None
diff --git a/database.py b/database.py
index 91a2cee..a5c0b05 100644
--- a/database.py
+++ b/database.py
@@ -7,8 +7,6 @@ import bcrypt
 import uuid
 import tasks
 
-class InvalidQueryException(Exception):
-    pass
 
 
 class BlacklistedWebsite:
@@ -351,13 +349,13 @@ class Database:
 
         return stats
 
-    def log_search(self, remote_addr, forwarded_for, q, exts, page):
+    def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
 
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
 
-            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
-                           "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
+            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
+                           "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
 
             conn.commit()
 
diff --git a/init_script.sql b/init_script.sql
index 87e214c..cfebb42 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
   forwarded_for TEXT,
   query TEXT,
   extensions TEXT,
-  page INT
+  page INT,
+  blocked INT DEFAULT 0,
+  results INT DEFAULT 0,
+  took INT DEFAULT 0
 );
 
 CREATE TABLE Queue (
diff --git a/search/filter.py b/search/filter.py
new file mode 100644
index 0000000..ecdb21f
--- /dev/null
+++ b/search/filter.py
@@ -0,0 +1,28 @@
+import os
+
+from fold_to_ascii.fold_to_ascii import mapping
+
+
+class SearchFilter:
+
+    def __init__(self):
+
+        self.blacklisted_terms = set()
+        self.table = str.maketrans(dict(mapping.translate_table))
+
+        if os.path.exists("search_blacklist.txt"):
+            with open("search_blacklist.txt") as f:
+                self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip())
+
+    def should_block(self, query) -> bool:
+
+        query = query.translate(self.table)
+        query = query.lower()
+
+        for raw_token in query.split():
+
+            token = raw_token.strip("\"'/\\").strip()
+            if token in self.blacklisted_terms:
+                return True
+
+        return False
diff --git a/search/search.py b/search/search.py
index f65f539..28b8a0e 100644
--- a/search/search.py
+++ b/search/search.py
@@ -5,6 +5,12 @@ import os
 import ujson
 from apscheduler.schedulers.background import BackgroundScheduler
 
+from search.filter import SearchFilter
+
+
+class InvalidQueryException(Exception):
+    pass
+
 
 class IndexingError(Exception):
     pass
@@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
         super().__init__()
         self.index_name = index_name
         self.es = elasticsearch.Elasticsearch()
+        self.filter = SearchFilter()
 
         if not self.es.indices.exists(self.index_name):
             self.init()
@@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
                date_max) -> {}:
 
+        if self.filter.should_block(query):
+            raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
+                                        "This incident has been reported.")
+
         filters = []
         if extensions:
             filters.append({"terms": {"ext": extensions}})