mirror of https://github.com/simon987/od-database.git (synced 2025-04-18 18:06:44 +00:00)
Search filter

commit 6e80791264 (parent a461b22ffc)
.gitmodules (vendored, new file, +3)
@@ -0,0 +1,3 @@
+[submodule "fold_to_ascii"]
+	path = fold_to_ascii
+	url = https://github.com/spanishdict/fold_to_ascii
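Existing checkouts need the new submodule fetched explicitly, for example:

    git submodule update --init fold_to_ascii

(a fresh clone with git clone --recursive pulls it in automatically).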
app.py (20 lines changed)
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 import os
 import time
 import datetime
-from database import Database, Website, InvalidQueryException
+from database import Database, Website
 from flask_recaptcha import ReCaptcha
 import od_util
 import config
 from flask_caching import Cache
 from tasks import TaskManager, Task, TaskResult
-from search.search import ElasticSearchEngine
+from search.search import ElasticSearchEngine, InvalidQueryException
 from callbacks import PostCrawlCallbackFactory

 app = Flask(__name__)
@@ -287,11 +287,10 @@ def search():

     if len(q) >= 3:

+        blocked = False
+        hits = None
         response = request.args.get("g-recaptcha-response", "")
         if not config.CAPTCHA_SEARCH or recaptcha_search.verify(response):
-            db.log_search(request.remote_addr,
-                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
-                          q, extensions, page)

             try:
                 hits = searchEngine.search(q, page, per_page, sort_order,
@@ -299,14 +298,19 @@ def search():
                 hits = db.join_website_on_search_result(hits)
             except InvalidQueryException as e:
                 flash("<strong>Invalid query:</strong> " + str(e), "warning")
-                return redirect("/search")
+                blocked = True
             except Exception:
                 flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
                       "Please try again later", "danger")
-                hits = None
+
+            db.log_search(request.remote_addr,
+                          request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None,
+                          q, extensions, page, blocked,
+                          hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1)
+            if blocked:
+                return redirect("/search")
         else:
             flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
-            hits = None

     else:
         hits = None
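The log call moves below the try/except so it can record the outcome: the new blocked flag, plus a result count and query time taken straight off the Elasticsearch response. For reference, a rough sketch of that response shape (values illustrative; this code assumes the older ES behaviour where hits.total is a plain integer):

    # Abbreviated Elasticsearch search response, as consumed above:
    hits = {
        "took": 12,         # server-side query time in ms -> logged as `took`
        "hits": {
            "total": 1337,  # total matching documents     -> logged as `results`
            "hits": [],     # the current page of results
        },
    }
    # hence: hits["hits"]["total"] if hits else -1, hits["took"] if hits else -1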
database.py

@@ -7,8 +7,6 @@ import bcrypt
 import uuid
 import tasks

-class InvalidQueryException(Exception):
-    pass


 class BlacklistedWebsite:
@@ -351,13 +349,13 @@ class Database:

         return stats

-    def log_search(self, remote_addr, forwarded_for, q, exts, page):
+    def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):

         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()

-            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page) VALUES "
-                           "(?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page))
+            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
+                           "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))

             conn.commit()
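A quick sketch of the widened signature in use (argument values made up for illustration):

    db.log_search("198.51.100.7", None,      # remote_addr, forwarded_for
                  "ubuntu iso", ["iso"], 0,  # query, extensions, page
                  False, 412, 35)            # blocked, results, took (ms)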
@@ -45,7 +45,10 @@ CREATE TABLE SearchLogEntry (
     forwarded_for TEXT,
     query TEXT,
     extensions TEXT,
-    page INT
+    page INT,
+    blocked INT DEFAULT 0,
+    results INT DEFAULT 0,
+    took INT DEFAULT 0
 );

 CREATE TABLE Queue (
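The schema change only applies to freshly initialised databases; an existing deployment would presumably need the columns added by hand. A minimal sketch, assuming a SQLite file created from the old schema (the path is illustrative):

    import sqlite3

    # SQLite supports ALTER TABLE ... ADD COLUMN, so the table can be
    # migrated in place without touching existing rows.
    with sqlite3.connect("db.sqlite3") as conn:
        for col in ("blocked", "results", "took"):
            conn.execute("ALTER TABLE SearchLogEntry ADD COLUMN {} INT DEFAULT 0".format(col))
        conn.commit()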
search/filter.py (new file, +28)
@@ -0,0 +1,28 @@
+import os
+
+from fold_to_ascii.fold_to_ascii import mapping
+
+
+class SearchFilter:
+
+    def __init__(self):
+
+        self.blacklisted_terms = set()
+        self.table = str.maketrans(dict(mapping.translate_table))
+
+        if os.path.exists("search_blacklist.txt"):
+            with open("search_blacklist.txt") as f:
+                self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip())
+
+    def should_block(self, query) -> bool:
+
+        query = query.translate(self.table)
+        query = query.lower()
+
+        for raw_token in query.split():
+
+            token = raw_token.strip("\"'/\\").strip()
+            if token in self.blacklisted_terms:
+                return True
+
+        return False
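A minimal sketch of the filter's behaviour, assuming a search_blacklist.txt containing the single (hypothetical) term "badterm":

    f = SearchFilter()
    f.should_block("looking for badterm here")   # True  - exact token match
    f.should_block('"badterm"')                  # True  - quotes/slashes stripped from tokens
    f.should_block("looking for bädtérm here")   # True  - accents folded to ASCII first
    f.should_block("BADTERM")                    # True  - lower-cased before matching
    f.should_block("harmless query")             # False

Note the matching is on whole tokens, so embedded occurrences ("xbadtermx") pass through; the fold_to_ascii translation table is what makes trivial accent obfuscation ineffective.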
search/search.py

@@ -5,6 +5,12 @@ import os
 import ujson
 from apscheduler.schedulers.background import BackgroundScheduler

+from search.filter import SearchFilter
+
+
+class InvalidQueryException(Exception):
+    pass
+

 class IndexingError(Exception):
     pass
@@ -49,6 +55,7 @@ class ElasticSearchEngine(SearchEngine):
         super().__init__()
         self.index_name = index_name
         self.es = elasticsearch.Elasticsearch()
+        self.filter = SearchFilter()

         if not self.es.indices.exists(self.index_name):
             self.init()
@@ -165,6 +172,10 @@ class ElasticSearchEngine(SearchEngine):
     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
                date_max) -> {}:

+        if self.filter.should_block(query):
+            raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
+                                        "This incident has been reported.")
+
         filters = []
         if extensions:
             filters.append({"terms": {"ext": extensions}})
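Because should_block() runs before the Elasticsearch query is even built, blocked searches never reach the index. From the caller's side this surfaces as an ordinary invalid query; a hedged sketch (argument values illustrative, matching the signature above):

    try:
        hits = search_engine.search("some query", 0, 50, "score", [],
                                    None, None, False, ["name"], None, None)
    except InvalidQueryException:
        # Blocked by the filter (or otherwise invalid); app.py flashes the
        # message, logs the attempt with blocked=True and redirects to /search.
        hits = None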