mirror of
https://github.com/simon987/od-database.git
synced 2025-04-16 00:46:46 +00:00
Search filter
This commit is contained in:
parent
a461b22ffc
commit
6e80791264
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
[submodule "fold_to_ascii"]
|
||||
path = fold_to_ascii
|
||||
url = https://github.com/spanishdict/fold_to_ascii
|
20
app.py
20
app.py
@ -5,13 +5,13 @@ from urllib.parse import urlparse
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
from database import Database, Website, InvalidQueryException
|
||||
from database import Database, Website
|
||||
from flask_recaptcha import ReCaptcha
|
||||
import od_util
|
||||
import config
|
||||
from flask_caching import Cache
|
||||
from tasks import TaskManager, Task, TaskResult
|
||||
from search.search import ElasticSearchEngine
|
||||
from search.search import ElasticSearchEngine, InvalidQueryException
|
||||
from callbacks import PostCrawlCallbackFactory
|
||||
|
||||
app = Flask(__name__)
|
||||
@ -287,11 +287,10 @@ def search():
|
||||
|
||||
if len(q) >= 3:
|
||||
|
||||
blocked = False
|
||||
hits = None
|
||||
response = request.args.get("g-recaptcha-response", "")
|
||||
if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
|
||||
db.log_search(request.remote_addr,
|
||||
request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
|
||||
q, extensions, page)
|
||||
|
||||
try:
|
||||
hits = searchEngine.search(q, page, per_page, sort_order,
|
||||
@ -299,14 +298,19 @@ def search():
|
||||
hits = db.join_website_on_search_result(hits)
|
||||
except InvalidQueryException as e:
|
||||
flash("<strong>Invalid query:</strong> " + str(e), "warning")
|
||||
return redirect("/search")
|
||||
blocked = True
|
||||
except Exception:
|
||||
flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
|
||||
"Please try again later", "danger")
|
||||
hits = None
|
||||
|
||||
db.log_search(request.remote_addr,
|
||||
request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
|
||||
q, extensions, page, blocked,
|
||||
hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
|
||||
if blocked:
|
||||
return redirect("/search")
|
||||
else:
|
||||
flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
|
||||
hits = None
|
||||
|
||||
else:
|
||||
hits = None
|
||||
|
@ -7,8 +7,6 @@ import bcrypt
|
||||
import uuid
|
||||
import tasks
|
||||
|
||||
class InvalidQueryException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class BlacklistedWebsite:
|
||||
@ -351,13 +349,13 @@ class Database:
|
||||
|
||||
return stats
|
||||
|
||||
def log_search(self, remote_addr, forwarded_for, q, exts, page):
|
||||
def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
|
||||
"(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
|
||||
cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
|
||||
"VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
|
||||
forwarded_for TEXT,
|
||||
query TEXT,
|
||||
extensions TEXT,
|
||||
page INT
|
||||
page INT,
|
||||
blocked INT DEFAULT 0,
|
||||
results INT DEFAULT 0,
|
||||
took INT DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE Queue (
|
||||
|
28
search/filter.py
Normal file
28
search/filter.py
Normal file
@ -0,0 +1,28 @@
|
||||
import os
|
||||
|
||||
from fold_to_ascii.fold_to_ascii import mapping
|
||||
|
||||
|
||||
class SearchFilter:
    """Blocks search queries that contain blacklisted terms.

    Terms are loaded from an optional ``search_blacklist.txt`` file in the
    working directory (one term per line; ``#`` lines are comments).
    Queries are ASCII-folded via ``fold_to_ascii`` before matching, so
    accented spellings of a blacklisted term are caught as well.
    """

    # Optional file holding one blacklisted term per line.
    BLACKLIST_FILE = "search_blacklist.txt"

    def __init__(self):
        self.blacklisted_terms = set()
        # Translation table folding accented characters to plain ASCII,
        # built once so should_block() is a single C-level translate pass.
        self.table = str.maketrans(dict(mapping.translate_table))

        if os.path.exists(self.BLACKLIST_FILE):
            with open(self.BLACKLIST_FILE) as f:
                for line in f:  # iterate lazily instead of readlines()
                    term = line.strip()
                    # Fix: check emptiness BEFORE indexing, and detect the
                    # comment marker on the stripped line so indented
                    # "  # note" lines are not loaded as blacklist terms
                    # (the original tested line[0] on the raw line).
                    if term and not term.startswith("#"):
                        self.blacklisted_terms.add(term)

    def should_block(self, query) -> bool:
        """Return True if any whitespace-separated token of *query*,
        after ASCII folding, lowercasing, and stripping surrounding
        quote/slash characters, is a blacklisted term."""
        query = query.translate(self.table).lower()

        for raw_token in query.split():
            # Strip surrounding quotes and slashes so quoted or
            # path-wrapped terms ("badword", /badword/) still match.
            token = raw_token.strip("\"'/\\").strip()
            if token in self.blacklisted_terms:
                return True

        return False
|
@ -5,6 +5,12 @@ import os
|
||||
import ujson
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
|
||||
from search.filter import SearchFilter
|
||||
|
||||
|
||||
class InvalidQueryException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class IndexingError(Exception):
|
||||
pass
|
||||
@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
|
||||
super().__init__()
|
||||
self.index_name = index_name
|
||||
self.es = elasticsearch.Elasticsearch()
|
||||
self.filter = SearchFilter()
|
||||
|
||||
if not self.es.indices.exists(self.index_name):
|
||||
self.init()
|
||||
@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
|
||||
def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
|
||||
date_max) -> {}:
|
||||
|
||||
if self.filter.should_block(query):
|
||||
raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
|
||||
"This incident has been reported.")
|
||||
|
||||
filters = []
|
||||
if extensions:
|
||||
filters.append({"terms": {"ext": extensions}})
|
||||
|
Loading…
x
Reference in New Issue
Block a user