Search filter

Author: Simon, 2018-11-16 16:49:23 -05:00
parent a461b22ffc
commit 6e80791264
6 changed files with 61 additions and 14 deletions

.gitmodules (vendored, new file, 3 additions)

@@ -0,0 +1,3 @@
+[submodule "fold_to_ascii"]
+	path = fold_to_ascii
+	url = https://github.com/spanishdict/fold_to_ascii

app.py (20 changes)

@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 import os
 import time
 import datetime
-from database import Database, Website, InvalidQueryException
+from database import Database, Website
 from flask_recaptcha import ReCaptcha
 import od_util
 import config
 from flask_caching import Cache
 from tasks import TaskManager, Task, TaskResult
-from search.search import ElasticSearchEngine
+from search.search import ElasticSearchEngine, InvalidQueryException
 from callbacks import PostCrawlCallbackFactory
 app = Flask(__name__)
@@ -287,11 +287,10 @@ def search():
     if len(q) >= 3:
+        blocked = False
+        hits = None
         response = request.args.get("g-recaptcha-response", "")
         if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
-            db.log_search(request.remote_addr,
-                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
-                          q, extensions, page)
             try:
                 hits = searchEngine.search(q, page, per_page, sort_order,
@@ -299,14 +298,19 @@ def search():
                 hits = db.join_website_on_search_result(hits)
             except InvalidQueryException as e:
                 flash("<strong>Invalid query:</strong> " + str(e), "warning")
-                return redirect("/search")
+                blocked = True
             except Exception:
                 flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
                       "Please try again later", "danger")
-                hits = None
+
+            db.log_search(request.remote_addr,
+                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
+                          q, extensions, page, blocked,
+                          hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
+            if blocked:
+                return redirect("/search")
         else:
             flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
-            hits = None
     else:
         hits = None
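
For context on the two new values being logged: the object returned by searchEngine.search() (and passed through db.join_website_on_search_result) appears to follow the usual Elasticsearch response shape, which is why the logging call can read the total hit count and the query time straight off it. A minimal illustration of the shape the call relies on (field values are made up; only hits["hits"]["total"] and hits["took"] matter here):

# Illustrative only: the real object comes from ElasticSearchEngine.search().
hits = {
    "took": 12,                  # query time in milliseconds
    "hits": {
        "total": 38,             # number of matching documents
        "hits": [],              # the documents themselves (omitted)
    },
}
results = hits["hits"]["total"] if hits else -1   # -1 when the search failed or was blocked
took = hits["took"] if hits else -1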

database.py

@@ -7,8 +7,6 @@ import bcrypt
 import uuid
 import tasks
-class InvalidQueryException(Exception):
-    pass
 class BlacklistedWebsite:
@@ -351,13 +349,13 @@ class Database:
         return stats
-    def log_search(self, remote_addr, forwarded_for, q, exts, page):
+    def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
-            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
-                           "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
+            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
+                           "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
             conn.commit()

Database schema (SQL)

@@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
     forwarded_for TEXT,
     query TEXT,
     extensions TEXT,
-    page INT
+    page INT,
+    blocked INT DEFAULT 0,
+    results INT DEFAULT 0,
+    took INT DEFAULT 0
 );
 CREATE TABLE Queue (
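
With the three new columns in place, blocked queries can be audited directly from the SQLite database. A minimal sketch, assuming the database file is named db.sqlite3 for illustration (the real path comes from the application's configuration):

import sqlite3

# Hypothetical database path, for illustration only.
with sqlite3.connect("db.sqlite3") as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT remote_addr, query, results, took "
                   "FROM SearchLogEntry WHERE blocked = 1")
    for remote_addr, query, results, took in cursor.fetchall():
        print(remote_addr, query, results, took)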

search/filter.py (new file, 28 additions)

@@ -0,0 +1,28 @@
+import os
+
+from fold_to_ascii.fold_to_ascii import mapping
+
+
+class SearchFilter:
+
+    def __init__(self):
+        self.blacklisted_terms = set()
+        self.table = str.maketrans(dict(mapping.translate_table))
+
+        if os.path.exists("search_blacklist.txt"):
+            with open("search_blacklist.txt") as f:
+                self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip())
+
+    def should_block(self, query) -> bool:
+        query = query.translate(self.table)
+        query = query.lower()
+
+        for raw_token in query.split():
+            token = raw_token.strip("\"'/\\").strip()
+            if token in self.blacklisted_terms:
+                return True
+
+        return False
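
A rough usage sketch of the new filter (the blacklist entries below are invented; search_blacklist.txt is the file the class actually reads, one term per line with # for comments). Because the query is passed through fold_to_ascii's translation table and lower-cased before tokenizing, accented, mixed-case, or quoted variants of a blacklisted term are caught as well:

from search.filter import SearchFilter

# search_blacklist.txt (hypothetical contents):
#   # terms that should never be searchable
#   badterm

f = SearchFilter()
print(f.should_block("perfectly normal query"))   # False
print(f.should_block("BadTerm"))                  # True: lower-cased before matching
print(f.should_block("bädtérm"))                  # True: accents folded to ASCII first
print(f.should_block('"badterm"'))                # True: surrounding quotes are stripped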

search/search.py

@@ -5,6 +5,12 @@ import os
 import ujson
 from apscheduler.schedulers.background import BackgroundScheduler
+from search.filter import SearchFilter
+
+
+class InvalidQueryException(Exception):
+    pass
+
 class IndexingError(Exception):
     pass
@@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
         super().__init__()
         self.index_name = index_name
         self.es = elasticsearch.Elasticsearch()
+        self.filter = SearchFilter()
         if not self.es.indices.exists(self.index_name):
             self.init()
@@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
                date_max) -> {}:
+        if self.filter.should_block(query):
+            raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
+                                        "This incident has been reported.")
+
         filters = []
         if extensions:
             filters.append({"terms": {"ext": extensions}})
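
From the caller's side, the check sits at the very top of search(), so a blacklisted query raises before any Elasticsearch query body is built. A minimal sketch of that behaviour (the index name, query string and search arguments are illustrative guesses, and constructing the engine still requires a reachable Elasticsearch instance):

from search.search import ElasticSearchEngine, InvalidQueryException

es = ElasticSearchEngine("od-database")   # hypothetical index name
try:
    # Arguments follow the signature shown above; values here are placeholders.
    es.search("some blacklisted term", 0, 50, "score", [], 0, 0, False, ["name"], 0, 0)
except InvalidQueryException as e:
    print("blocked:", e)   # app.py turns this into a flash message and logs the search with blocked=1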