diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..6be6199
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "fold_to_ascii"]
+ path = fold_to_ascii
+ url = https://github.com/spanishdict/fold_to_ascii
diff --git a/app.py b/app.py
index 7a93bf6..1278af5 100644
--- a/app.py
+++ b/app.py
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
import os
import time
import datetime
-from database import Database, Website, InvalidQueryException
+from database import Database, Website
from flask_recaptcha import ReCaptcha
import od_util
import config
from flask_caching import Cache
from tasks import TaskManager, Task, TaskResult
-from search.search import ElasticSearchEngine
+from search.search import ElasticSearchEngine, InvalidQueryException
from callbacks import PostCrawlCallbackFactory
app = Flask(__name__)
@@ -287,11 +287,10 @@ def search():
if len(q) >= 3:
+ blocked = False
+ hits = None
response = request.args.get("g-recaptcha-response", "")
if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
- db.log_search(request.remote_addr,
- request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
- q, extensions, page)
try:
hits = searchEngine.search(q, page, per_page, sort_order,
@@ -299,14 +298,19 @@ def search():
hits = db.join_website_on_search_result(hits)
except InvalidQueryException as e:
flash("Invalid query: " + str(e), "warning")
- return redirect("/search")
+ blocked = True
except Exception:
flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
"Please try again later", "danger")
- hits = None
+
+ db.log_search(request.remote_addr,
+ request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
+ q, extensions, page, blocked,
+ hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
+ if blocked:
+ return redirect("/search")
else:
flash("Error: Invalid captcha please try again", "danger")
- hits = None
else:
hits = None
diff --git a/database.py b/database.py
index 91a2cee..a5c0b05 100644
--- a/database.py
+++ b/database.py
@@ -7,8 +7,6 @@ import bcrypt
import uuid
import tasks
-class InvalidQueryException(Exception):
- pass
class BlacklistedWebsite:
@@ -351,13 +349,13 @@ class Database:
return stats
- def log_search(self, remote_addr, forwarded_for, q, exts, page):
+ def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
- cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
- "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
+ cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
+ "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
conn.commit()
diff --git a/init_script.sql b/init_script.sql
index 87e214c..cfebb42 100644
--- a/init_script.sql
+++ b/init_script.sql
@@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
forwarded_for TEXT,
query TEXT,
extensions TEXT,
- page INT
+ page INT,
+ blocked INT DEFAULT 0,
+ results INT DEFAULT 0,
+ took INT DEFAULT 0
);
CREATE TABLE Queue (
diff --git a/search/filter.py b/search/filter.py
new file mode 100644
index 0000000..ecdb21f
--- /dev/null
+++ b/search/filter.py
@@ -0,0 +1,28 @@
+import os
+
+from fold_to_ascii.fold_to_ascii import mapping
+
+
+class SearchFilter:
+    """Rejects search queries that contain a term listed in search_blacklist.txt."""
+
+    def __init__(self):
+        """Build the character translation table and load the blacklist if the file exists."""
+        self.blacklisted_terms = set()
+        self.table = str.maketrans(dict(mapping.translate_table))
+
+        if os.path.exists("search_blacklist.txt"):
+            with open("search_blacklist.txt") as f:
+                # Terms are normalized exactly like queries; otherwise an entry containing
+                # uppercase or accented characters could never equal a normalized token.
+                terms = (self._normalize(line).strip() for line in f if not line.startswith("#"))
+                self.blacklisted_terms.update(term for term in terms if term)
+
+    def _normalize(self, text):
+        """Return text translated through self.table and lowercased."""
+        return text.translate(self.table).lower()
+
+    def should_block(self, query) -> bool:
+        """Return True if any whitespace-separated token of query is blacklisted."""
+        tokens = self._normalize(query).split()
+        return any(token.strip("\"'/\\").strip() in self.blacklisted_terms for token in tokens)
diff --git a/search/search.py b/search/search.py
index f65f539..28b8a0e 100644
--- a/search/search.py
+++ b/search/search.py
@@ -5,6 +5,12 @@ import os
import ujson
from apscheduler.schedulers.background import BackgroundScheduler
+from search.filter import SearchFilter
+
+
+class InvalidQueryException(Exception):
+    """Raised when a search query is rejected, e.g. because the search filter blocks it."""
+
class IndexingError(Exception):
pass
@@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
super().__init__()
self.index_name = index_name
self.es = elasticsearch.Elasticsearch()
+ self.filter = SearchFilter()
if not self.es.indices.exists(self.index_name):
self.init()
@@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
+ if self.filter.should_block(query):
+ raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
+ "This incident has been reported.")
+
filters = []
if extensions:
filters.append({"terms": {"ext": extensions}})