Mirror of https://github.com/simon987/od-database.git (synced 2025-04-10 14:06:45 +00:00)
Switch to postgresql, finish minimum viable task_tracker/ws_bucket integration
parent b170f9bfd8
commit b9f25630b4
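Note: the recurring pattern in this diff is the swap from sqlite3 file connections to psycopg2 connections built from a connection string, with SQL placeholders changing from ? to %s. A minimal sketch of that pattern, assuming an example connection string (the real value lives in config.DB_CONN_STR) and an illustrative helper name:

import psycopg2

DB_CONN_STR = "dbname=od-database user=od-database"  # assumed example; see config.DB_CONN_STR

def touch_website(website_id):
    # Same shape as database.py after this commit: connect per call, %s placeholders
    with psycopg2.connect(DB_CONN_STR) as conn:
        cursor = conn.cursor()
        cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=%s",
                       (website_id,))
        conn.commit()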
@@ -37,7 +37,8 @@ def verify():
     )

     if "cap" in session:
-        expected = oddb.redis.get(session["cap"]).decode("utf8")
+        expected = oddb.redis.get(session["cap"])
+        expected = expected.decode("utf8") if expected is not None else ""
         oddb.redis.delete(session["cap"])

         if expected == attempt:
@@ -5,6 +5,7 @@ from logging import FileHandler, StreamHandler
 import redis as r
 from flask import session, abort

+import config
 from database import Database
 from search.search import ElasticSearchEngine
 from tasks import TaskManager
@@ -19,13 +20,15 @@ logger.setLevel(logging.DEBUG)
 formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
 file_handler = FileHandler("oddb.log")
 file_handler.setFormatter(formatter)
+for h in logger.handlers:
+    logger.removeHandler(h)
 logger.addHandler(file_handler)
 logger.addHandler(StreamHandler(sys.stdout))

 taskManager = TaskManager()
 searchEngine = ElasticSearchEngine("od-database")
 searchEngine.start_stats_scheduler()
-db = Database("db.sqlite3")
+db = Database(config.DB_CONN_STR)

 redis = r.Redis()

database.py (157 lines changed)
@@ -1,9 +1,9 @@
-import os
-import sqlite3
+import time
 import uuid
 from urllib.parse import urlparse

 import bcrypt
+import psycopg2


 class BlacklistedWebsite:
@@ -31,35 +31,42 @@ class ApiClient:

 class Database:

-    def __init__(self, db_path):
-
-        self.db_path = db_path
-
-        if not os.path.exists(db_path):
-            self.init_database()
+    def __init__(self, db_conn_str):
+        self.db_conn_str = db_conn_str
+        self.website_cache = dict()
+        self.website_cache_time = 0
+
+        with psycopg2.connect(self.db_conn_str) as conn:
+            cursor = conn.cursor()
+            cursor.execute("SELECT EXISTS (SELECT 1 FROM pg_tables "
+                           "WHERE tablename = 'searchlogentry')")
+
+            if not cursor.fetchone()[0]:
+                self.init_database()

     def init_database(self):

+        print("Initializing database")
+
         with open("init_script.sql", "r") as f:
             init_script = f.read()

-        with sqlite3.connect(self.db_path) as conn:
-            conn.executescript(init_script)
-            conn.commit()
+        with psycopg2.connect(self.db_conn_str) as conn:
+            cur = conn.cursor()
+            cur.execute(init_script)

     def update_website_date_if_exists(self, website_id):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:

             cursor = conn.cursor()
-            cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=?", (website_id, ))
+            cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=%s", (website_id,))
             conn.commit()

     def insert_website(self, website: Website):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
-            cursor.execute("INSERT INTO Website (url, logged_ip, logged_useragent) VALUES (?,?,?)",
+            cursor.execute("INSERT INTO Website (url, logged_ip, logged_useragent) VALUES (%s,%s,%s)",
                            (website.url, str(website.logged_ip), str(website.logged_useragent)))
             cursor.execute("SELECT LAST_INSERT_ROWID()")
@@ -70,28 +77,28 @@ class Database:

     def get_website_by_url(self, url):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT id, url, logged_ip, logged_useragent, last_modified FROM Website WHERE url=?",
-                           (url, ))
+            cursor.execute("SELECT id, url, logged_ip, logged_useragent, last_modified FROM Website WHERE url=%s",
+                           (url,))
             db_web = cursor.fetchone()
             if db_web:
-                website = Website(db_web[1], db_web[2], db_web[3], db_web[4], db_web[0])
+                website = Website(db_web[1], db_web[2], db_web[3], db_web[4], int(db_web[0].timestamp()))
                 return website
             else:
                 return None

     def get_website_by_id(self, website_id):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT * FROM Website WHERE id=?", (website_id, ))
+            cursor.execute("SELECT * FROM Website WHERE id=%s", (website_id,))
             db_web = cursor.fetchone()

             if db_web:
-                website = Website(db_web[1], db_web[2], db_web[3], db_web[4])
+                website = Website(db_web[1], db_web[2], db_web[3], int(db_web[4].timestamp()))
                 website.id = db_web[0]
                 return website
             else:
@@ -99,57 +106,58 @@ class Database:

     def get_websites(self, per_page, page: int, url):
         """Get all websites"""
-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

             cursor.execute("SELECT Website.id, Website.url, Website.last_modified FROM Website "
-                           "WHERE Website.url LIKE ?"
-                           "ORDER BY last_modified DESC LIMIT ? OFFSET ?", (url + "%", per_page, page * per_page))
+                           "WHERE Website.url LIKE %s "
+                           "ORDER BY last_modified DESC LIMIT %s OFFSET %s", (url + "%", per_page, page * per_page))

             return cursor.fetchall()

     def get_random_website_id(self):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
-            cursor.execute("SELECT id FROM Website WHERE id >= (abs(random()) % (SELECT max(id) FROM Website)) LIMIT 1;")
+            cursor.execute(
+                "SELECT id FROM Website WHERE id >= (abs(random()) % (SELECT max(id) FROM Website)) LIMIT 1;")

             return cursor.fetchone()[0]

     def website_exists(self, url):
         """Check if an url or the parent directory of an url already exists"""
-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)", (url, ))
+            cursor.execute("SELECT id FROM Website WHERE url = substr(%s, 0, length(url) + 1)", (url,))
             website_id = cursor.fetchone()
             return website_id[0] if website_id else None

     def delete_website(self, website_id):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("DELETE FROM Website WHERE id=?", (website_id, ))
+            cursor.execute("DELETE FROM Website WHERE id=%s", (website_id,))
             conn.commit()

     def check_login(self, username, password) -> bool:
-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT password FROM Admin WHERE username=?", (username, ))
+            cursor.execute("SELECT password FROM Admin WHERE username=%s", (username,))

             db_user = cursor.fetchone()

             if db_user:
-                return bcrypt.checkpw(password.encode(), db_user[0])
+                return bcrypt.checkpw(password.encode(), db_user[0].tobytes())
             return False

     def get_user_role(self, username: str):
-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT role FROM Admin WHERE username=?", (username, ))
+            cursor.execute("SELECT role FROM Admin WHERE username=%s", (username,))

             db_user = cursor.fetchone()

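Note on the .tobytes() call in check_login: psycopg2 returns BYTEA columns as memoryview objects, while bcrypt.checkpw expects bytes. A minimal sketch of the same check, with verify_password as an illustrative stand-alone helper rather than code from this commit:

import bcrypt

def verify_password(candidate: str, stored) -> bool:
    # psycopg2 hands BYTEA values back as memoryview; bcrypt wants bytes
    hashed = stored.tobytes() if isinstance(stored, memoryview) else stored
    return bcrypt.checkpw(candidate.encode(), hashed)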
@@ -159,37 +167,38 @@ class Database:

     def generate_login(self, username, password) -> None:

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

             hashed_pw = bcrypt.hashpw(password.encode(), bcrypt.gensalt(12))

-            cursor.execute("INSERT INTO Admin (username, password, role) VALUES (?,?, 'admin')", (username, hashed_pw))
+            cursor.execute("INSERT INTO Admin (username, password, role) VALUES (%s,%s, 'admin')",
+                           (username, hashed_pw))
             conn.commit()

     def check_api_token(self, token) -> str:

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT name FROM ApiClient WHERE token=?", (token, ))
+            cursor.execute("SELECT name FROM ApiClient WHERE token=%s", (token,))
             result = cursor.fetchone()
             return result[0] if result else None

     def generate_api_token(self, name: str) -> str:

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

             token = str(uuid.uuid4())
-            cursor.execute("INSERT INTO ApiClient (token, name) VALUES (?, ?)", (token, name))
+            cursor.execute("INSERT INTO ApiClient (token, name) VALUES (%s, %s)", (token, name))
             conn.commit()

             return token

     def get_tokens(self) -> list:

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

             cursor.execute("SELECT token, name FROM ApiClient")
@@ -198,26 +207,28 @@ class Database:

     def delete_token(self, token: str) -> None:

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("DELETE FROM ApiClient WHERE token=?", (token, ))
+            cursor.execute("DELETE FROM ApiClient WHERE token=%s", (token,))
             conn.commit()

     def get_all_websites(self) -> dict:
-
-        # todo: mem cache that
-        with sqlite3.connect(self.db_path) as conn:
-
-            cursor = conn.cursor()
-
-            cursor.execute("SELECT id, url FROM Website")
-
-            result = {}
-
-            for db_website in cursor.fetchall():
-                result[db_website[0]] = db_website[1]
-
-            return result
+        if self.website_cache_time + 120 < time.time():
+            with psycopg2.connect(self.db_conn_str) as conn:
+                cursor = conn.cursor()
+
+                cursor.execute("SELECT id, url FROM Website")
+
+                result = dict()
+
+                for db_website in cursor.fetchall():
+                    result[db_website[0]] = db_website[1]
+
+            self.website_cache = result
+            self.website_cache_time = time.time()
+
+        return self.website_cache

     def join_website_on_search_result(self, page: dict) -> dict:

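Note: get_all_websites now keeps the id-to-url mapping in memory and only refreshes it from PostgreSQL when the cached copy is older than 120 seconds, replacing the earlier "todo: mem cache that" query-per-call version. A minimal sketch of the time-based cache, with fetch_from_db as an assumed callable that runs the SELECT id, url FROM Website query and returns a dict:

import time

_cache = dict()
_cache_time = 0

def get_all_websites_cached(fetch_from_db):
    global _cache, _cache_time
    # Refresh at most once every 120 seconds, otherwise serve the cached mapping
    if _cache_time + 120 < time.time():
        _cache = fetch_from_db()
        _cache_time = time.time()
    return _cache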
@@ -248,39 +259,39 @@ class Database:
         websites = self.get_all_websites()

         for website in stats["website_scatter"]:
             website[0] = websites.get(website[0], "[DELETED]")

     def add_blacklist_website(self, url):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
             parsed_url = urlparse(url)
             url = parsed_url.scheme + "://" + parsed_url.netloc
-            cursor.execute("INSERT INTO BlacklistedWebsite (url) VALUES (?)", (url, ))
+            cursor.execute("INSERT INTO BlacklistedWebsite (url) VALUES (%s)", (url,))
             conn.commit()

     def remove_blacklist_website(self, blacklist_id):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("DELETE FROM BlacklistedWebsite WHERE id=?", (blacklist_id, ))
+            cursor.execute("DELETE FROM BlacklistedWebsite WHERE id=%s", (blacklist_id,))
             conn.commit()

     def is_blacklisted(self, url):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()
             parsed_url = urlparse(url)
             url = parsed_url.scheme + "://" + parsed_url.netloc
             print(url)
-            cursor.execute("SELECT id FROM BlacklistedWebsite WHERE url LIKE ? LIMIT 1", (url, ))
+            cursor.execute("SELECT id FROM BlacklistedWebsite WHERE url LIKE %s LIMIT 1", (url,))

             return cursor.fetchone() is not None

     def get_blacklist(self):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

             cursor.execute("SELECT * FROM BlacklistedWebsite")
@@ -288,10 +299,30 @@ class Database:

     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):

-        with sqlite3.connect(self.db_path) as conn:
+        with psycopg2.connect(self.db_conn_str) as conn:
             cursor = conn.cursor()

-            cursor.execute("INSERT INTO SearchLogEntry (remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
-                           "VALUES (?,?,?,?,?,?,?,?)", (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
+            cursor.execute(
+                "INSERT INTO SearchLogEntry "
+                "(remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
+                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
+                (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))

             conn.commit()

+    def get_oldest_updated_websites(self, size: int):
+
+        with psycopg2.connect(self.db_conn_str) as conn:
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT id, url, last_modified FROM website "
+                           "ORDER BY last_modified ASC LIMIT %s",
+                           (size,))
+            return [Website(url=r[1],
+                            website_id=r[0],
+                            last_modified=r[2],
+                            logged_ip=None,
+                            logged_useragent=None
+                            )
+                    for r in cursor.fetchall()]
@@ -1,6 +1,7 @@
 import csv
 import os

+import config
 from database import Database
 from search.search import ElasticSearchEngine

@@ -9,7 +10,7 @@ def export(outfile="out.csv"):

     print("Export started, connecting to databases...")
     es = ElasticSearchEngine("od-database")
-    db = Database("db.sqlite3")
+    db = Database(config.DB_CONN_STR)
     docs = es.stream_all_docs()
     docs_with_website = db.join_website_on_scan(docs)

@@ -1,22 +1,22 @@
-PRAGMA journal_mode=WAL;
+DROP TABLE IF EXISTS Website, Admin, BlacklistedWebsite, ApiClient, SearchLogEntry;

 CREATE TABLE Website (

-  id INTEGER PRIMARY KEY NOT NULL,
+  id SERIAL PRIMARY KEY NOT NULL,
   url TEXT,
   logged_ip TEXT,
   logged_useragent TEXT,
-  last_modified INTEGER DEFAULT CURRENT_TIMESTAMP
+  last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
 );

 CREATE TABLE Admin (
   username TEXT PRIMARY KEY NOT NULL,
-  password TEXT,
+  password BYTEA,
   role TEXT
 );

 CREATE TABLE BlacklistedWebsite (
-  id INTEGER PRIMARY KEY NOT NULL,
+  id SERIAL PRIMARY KEY NOT NULL,
   url TEXT
 );

@@ -25,16 +25,15 @@ CREATE TABLE ApiClient (
   token TEXT NOT NULL
 );

-
 CREATE TABLE SearchLogEntry (
-  id INTEGER PRIMARY KEY,
+  id SERIAL PRIMARY KEY,
   search_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
   remote_addr TEXT,
   forwarded_for TEXT,
   query TEXT,
   extensions TEXT,
   page INT,
-  blocked INT DEFAULT 0,
+  blocked BOOLEAN DEFAULT FALSE,
   results INT DEFAULT 0,
   took INT DEFAULT 0
 );
@@ -8,19 +8,17 @@ praw
 humanfriendly
 apscheduler
 bcrypt
-ftputil
 elasticsearch
 python-dateutil
 flask_httpauth
 ujson
 urllib3
 pyOpenSSL
-pybloom-live
-pycurl
 lxml
 pillow
 Wand
 numpy
 matplotlib
 uwsgi
 redis
+psycopg2-binary
tasks.py (74 lines changed)
@@ -2,6 +2,7 @@ import json
 import logging
 import os
 import time
+from multiprocessing.pool import ThreadPool
 from threading import Thread
 from uuid import uuid4

@@ -9,6 +10,7 @@ import urllib3

 import config
 import database
+from database import Website
 from search.search import ElasticSearchEngine
 from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 from ws_bucket_client.api import WsBucketApi
@@ -59,7 +61,7 @@ class TaskManager:

     def __init__(self):
         self.search = ElasticSearchEngine("od-database")
-        self.db = database.Database("db.sqlite3")
+        self.db = database.Database(config.DB_CONN_STR)
         self.tracker = TaskTrackerApi(config.TT_API)

         self.worker = Worker.from_file(self.tracker)
@@ -71,25 +73,33 @@ class TaskManager:

         self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)

-        self._indexer_thread = Thread(target=self._do_indexing)
-        self._indexer_thread.start()
+        self._indexer_threads = list()
+        logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
+        for _ in range(config.INDEXER_THREADS):
+            t = Thread(target=self._do_indexing)
+            self._indexer_threads.append(t)
+            t.start()
+
+        self._recrawl_thread = Thread(target=self._do_recrawl)
+        self._recrawl_thread.start()

     def _do_indexing(self):

         while True:
-            logger.debug("Fetching indexing task...")
-            task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
+            task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)

             if task:
                 try:
                     recipe = task.json_recipe()
                     logger.debug("Got indexing task: " + str(recipe))
-                    filename = os.path.join(config.WSB_PATH, format_file_name(recipe["website_id"], recipe["upload_token"]))
+                    filename = os.path.join(config.WSB_PATH,
+                                            format_file_name(recipe["website_id"], recipe["upload_token"]))
+                    self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
                 except Exception as e:
-                    print(e)
+                    self.worker.release_task(task_id=task.id, result=1, verification=0)
                 finally:
                     try:
-                        self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
+                        self.worker.release_task(task_id=task.id, result=0, verification=0)
                     except:
                         pass
             else:
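Note: the single indexer thread becomes a pool of config.INDEXER_THREADS threads, each looping on the worker's fetch_task, plus a dedicated re-crawl thread. A minimal generic sketch of the loop-per-thread pattern, where fetch_task and handle_task are placeholders rather than the task_tracker_drone API:

import threading

def start_workers(n, fetch_task, handle_task):
    def loop():
        while True:
            task = fetch_task()          # may return None when no work is available
            if task:
                try:
                    handle_task(task)
                except Exception:
                    pass                 # release/retry handling would go here
    threads = [threading.Thread(target=loop, daemon=True) for _ in range(n)]
    for t in threads:
        t.start()
    return threads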
@@ -108,29 +118,34 @@ class TaskManager:
                line = f.readline()

         self.search.import_json(iter_lines(), task.website_id)
+        os.remove(file_list)

         self.db.update_website_date_if_exists(task.website_id)

-    def fetch_indexing_task(self):
+    def _do_recrawl(self):
+        while True:
+            time.sleep(60 * 30)
+            logger.debug("Creating re-crawl tasks")
+            self._generate_crawling_tasks()

-        task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
-        print(task)
+    def _generate_crawling_tasks(self):
+
+        # TODO: Insert more in-depth re-crawl logic here
+        websites_to_crawl = self.db.get_oldest_updated_websites(10000)
+
+        def recrawl(website: Website):
+            crawl_task = Task(website.id, website.url,
+                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600))
+                              )
+            self.queue_task(crawl_task)
+
+        pool = ThreadPool(processes=10)
+        pool.map(func=recrawl, iterable=websites_to_crawl)

     def queue_task(self, task: Task):

         max_assign_time = 24 * 7 * 3600
         upload_token = uuid4().__str__()

-        bucket_response = self.bucket.allocate(upload_token.__str__(),
-                                               21474837499,  # 20Gib
-                                               format_file_name(task.website_id, upload_token),
-                                               to_dispose_date=int(time.time() + max_assign_time),
-                                               upload_hook="")
-        if not bucket_response:
-            return
-
-        print("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
-
         task.upload_token = upload_token
         tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
                                                    recipe=task.__str__(),
@@ -140,9 +155,18 @@ class TaskManager:
                                                    verification_count=1,
                                                    max_retries=3
                                                    )
-        print("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
+        print(tracker_response.text)
+        logging.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
+        if not tracker_response.json()["ok"]:
+            return
+
+        bucket_response = self.bucket.allocate(upload_token.__str__(),
+                                               21474837499,  # 20Gib
+                                               format_file_name(task.website_id, upload_token),
+                                               to_dispose_date=int(time.time() + max_assign_time),
+                                               upload_hook="")
+        logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))


 def format_file_name(website_id, token):
-    return "%d_%s.NDJSON" % (website_id, token, )
+    return "%d_%s.NDJSON" % (website_id, token,)
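Note: the new re-crawl pass wakes every 30 minutes, takes the 10,000 least recently crawled websites, and queues them with a priority equal to the age of the last crawl in hours, fanned out over a 10-worker thread pool. A minimal sketch of that priority computation (recrawl_priority is an illustrative name, not a function in the diff), assuming last_modified is a datetime:

import time
from datetime import datetime, timedelta

def recrawl_priority(last_modified: datetime) -> int:
    # Older websites get a larger priority value: hours elapsed since the last crawl
    return int((time.time() - last_modified.timestamp()) / 3600)

# A site last crawled three days ago gets priority 72
print(recrawl_priority(datetime.now() - timedelta(days=3)))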
@@ -74,12 +74,6 @@
             </form>

             <br>
-            <hr>
-            <h3>Misc actions</h3>
-
-            <a class="btn btn-danger" href="/website/delete_empty">Delete websites with no associated files that are
-                not queued</a>
-
             <hr>
             <a class="btn btn-info" href="/logout">Logout</a>
         </div>