Search filter

This commit is contained in:
Simon 2018-11-16 16:49:23 -05:00
parent a461b22ffc
commit 6e80791264
6 changed files with 61 additions and 14 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "fold_to_ascii"]
path = fold_to_ascii
url = https://github.com/spanishdict/fold_to_ascii

20
app.py
View File

@ -5,13 +5,13 @@ from urllib.parse import urlparse
import os
import time
import datetime
from database import Database, Website, InvalidQueryException
from database import Database, Website
from flask_recaptcha import ReCaptcha
import od_util
import config
from flask_caching import Cache
from tasks import TaskManager, Task, TaskResult
from search.search import ElasticSearchEngine
from search.search import ElasticSearchEngine, InvalidQueryException
from callbacks import PostCrawlCallbackFactory
app = Flask(__name__)
@ -287,11 +287,10 @@ def search():
if len(q) >= 3:
blocked = False
hits = None
response = request.args.get("g-recaptcha-response", "")
if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
db.log_search(request.remote_addr,
request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
q, extensions, page)
try:
hits = searchEngine.search(q, page, per_page, sort_order,
@ -299,14 +298,19 @@ def search():
hits = db.join_website_on_search_result(hits)
except InvalidQueryException as e:
flash("<strong>Invalid query:</strong> " + str(e), "warning")
return redirect("/search")
blocked = True
except Exception:
flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
"Please try again later", "danger")
hits = None
db.log_search(request.remote_addr,
request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
q, extensions, page, blocked,
hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
if blocked:
return redirect("/search")
else:
flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
hits = None
else:
hits = None

View File

@ -7,8 +7,6 @@ import bcrypt
import uuid
import tasks
class InvalidQueryException(Exception):
pass
class BlacklistedWebsite:
@ -351,13 +349,13 @@ class Database:
return stats
def log_search(self, remote_addr, forwarded_for, q, exts, page):
def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
"(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
"VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
conn.commit()

View File

@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
forwarded_for TEXT,
query TEXT,
extensions TEXT,
page INT
page INT,
blocked INT DEFAULT 0,
results INT DEFAULT 0,
took INT DEFAULT 0
);
CREATE TABLE Queue (

28
search/filter.py Normal file
View File

@ -0,0 +1,28 @@
import os
from fold_to_ascii.fold_to_ascii import mapping
class SearchFilter:
    """Reject search queries that contain a blacklisted term.

    Terms are loaded once, at construction time, from ``search_blacklist.txt``
    in the current working directory: one term per line, blank lines and
    lines whose first non-blank character is ``#`` are ignored.  Queries and
    blacklist entries go through the same normalization (translation through
    the ``fold_to_ascii`` mapping table, then lower-casing) so that both
    sides of the comparison are in the same form.
    """

    def __init__(self):
        self.blacklisted_terms = set()
        # Character translation table built from the fold_to_ascii mapping.
        self.table = str.maketrans(dict(mapping.translate_table))

        # The blacklist file is optional; without it nothing is blocked.
        if os.path.exists("search_blacklist.txt"):
            self._load_blacklist("search_blacklist.txt")

    def _normalize(self, text):
        """Return *text* translated through ``self.table`` and lower-cased."""
        return text.translate(self.table).lower()

    def _load_blacklist(self, path):
        """Add every non-blank, non-comment line of *path* to the blacklist."""
        with open(path, encoding="utf-8") as f:
            for line in f:
                term = line.strip()
                if not term or term.startswith("#"):
                    continue
                # Store the term in the form queries are compared in;
                # otherwise an entry containing capitals or accented
                # characters could never match a normalized query token.
                self.blacklisted_terms.add(self._normalize(term))

    def should_block(self, query) -> bool:
        """Return True if any whitespace-separated token of *query* is blacklisted.

        Leading and trailing quotes, slashes and backslashes are stripped
        from each token before it is looked up.
        """
        return any(
            raw_token.strip("\"'/\\").strip() in self.blacklisted_terms
            for raw_token in self._normalize(query).split()
        )

View File

@ -5,6 +5,12 @@ import os
import ujson
from apscheduler.schedulers.background import BackgroundScheduler
from search.filter import SearchFilter
class InvalidQueryException(Exception):
    """Signal that a search query was rejected.

    ``ElasticSearchEngine.search`` raises this when the search filter
    blocks one or more terms of the query; the message passed to the
    constructor is what callers get from ``str(exc)``.
    """
class IndexingError(Exception):
    """Exception type for indexing problems.

    NOTE(review): no raise site is visible in this excerpt; confirm where
    this is raised before documenting it more precisely.
    """
@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
super().__init__()
self.index_name = index_name
self.es = elasticsearch.Elasticsearch()
self.filter = SearchFilter()
if not self.es.indices.exists(self.index_name):
self.init()
@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
date_max) -> {}:
if self.filter.should_block(query):
raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
"This incident has been reported.")
filters = []
if extensions:
filters.append({"terms": {"ext": extensions}})