mirror of
https://github.com/simon987/od-database.git
synced 2025-12-13 14:59:02 +00:00
Tasks can now be queued from the web interface. Tasks are dispatched to the crawl server(s)
This commit is contained in:
252
database.py
252
database.py
@@ -20,16 +20,6 @@ class Website:
|
||||
self.id = website_id
|
||||
|
||||
|
||||
class File:
    """Record describing a single file: owning website id, path, mime tag, name and size."""

    def __init__(self, website_id: int, path: str, mime: str, name: str, size: int):
        # Where the file lives: the website it belongs to and its path on that website.
        self.website_id, self.path = website_id, path
        # What the file is: its name, its type tag and its size.
        self.name, self.mime, self.size = name, mime, size
|
||||
|
||||
|
||||
class ApiToken:
|
||||
|
||||
def __init__(self, token, description):
|
||||
@@ -39,13 +29,6 @@ class ApiToken:
|
||||
|
||||
class Database:
|
||||
|
||||
# Public sort key -> ORDER BY clause appended to the search query.
# "none" maps to an empty clause, i.e. no explicit ordering.
SORT_ORDERS = dict(
    score="ORDER BY rank",
    size_asc="ORDER BY size ASC",
    size_dsc="ORDER BY size DESC",
    none="",
)
|
||||
|
||||
def __init__(self, db_path):
|
||||
|
||||
self.db_path = db_path
|
||||
@@ -75,60 +58,6 @@ class Database:
|
||||
|
||||
return website_id
|
||||
|
||||
def insert_files(self, files: list):
    """Insert a batch of files together with the WebsitePath and FileType rows they reference.

    :param files: objects exposing ``website_id``, ``path``, ``mime``, ``name`` and ``size``

    One WebsitePath row is created per distinct (website_id, path) pair found in the
    batch, so a batch may safely mix files from several websites. Missing FileType rows
    are created on the fly. All writes happen in a single transaction, and
    ``last_modified`` is refreshed for every website that appears in the batch.
    """
    if not files:
        # Nothing to write and no website to touch.
        return

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        # Insert paths first. The cache is keyed by (website_id, path): keying by path
        # alone would attach files of a second website to the first website's path row.
        path_ids = dict()
        for file in files:
            key = (file.website_id, file.path)
            if key not in path_ids:
                cursor.execute("INSERT INTO WebsitePath (website_id, path) VALUES (?,?)", key)
                path_ids[key] = cursor.lastrowid

        # Then file types: load the known ones, create the ones that are missing.
        cursor.execute("SELECT id, mime FROM FileType")
        mime_ids = {mime: mime_id for mime_id, mime in cursor.fetchall()}
        for file in files:
            if file.mime not in mime_ids:
                cursor.execute("INSERT INTO FileType (mime) VALUES (?)", (file.mime, ))
                mime_ids[file.mime] = cursor.lastrowid

        # Then insert files
        cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
                           [(path_ids[(x.website_id, x.path)], x.name, x.size, mime_ids[x.mime])
                            for x in files])

        # Update date of every website in the batch (dict.fromkeys de-duplicates, keeps order)
        cursor.executemany("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
                           [(website_id, ) for website_id in dict.fromkeys(x.website_id for x in files)])

        conn.commit()
|
||||
|
||||
def import_json(self, json_file, website: Website):
    """Load a JSON file listing and insert its files for ``website``.

    :param json_file: path of a JSON file holding a list of objects with
        ``path``, ``name`` and ``size`` keys
    :param website: the website the files belong to; it is inserted first when
        no website with the same url is known yet

    The mime stored for each file is the lower-cased extension of its name.
    Failures while parsing or inserting are reported on stdout and swallowed
    (best-effort import); a failure to open the file propagates to the caller.
    """
    existing = self.get_website_by_url(website.url)
    if not existing:
        website_id = self.insert_website(website)
    elif website.id is not None:
        website_id = website.id
    else:
        # The caller built `website` without an id: use the stored row's id instead
        # of importing under None.
        # NOTE(review): assumes get_website_by_url returns an object exposing `.id` - confirm.
        website_id = existing.id

    with open(json_file, "r") as f:
        try:
            entries = json.load(f)
            self.insert_files([
                File(website_id, x["path"], os.path.splitext(x["name"])[1].lower(), x["name"], x["size"])
                for x in entries
            ])
        except Exception as e:
            print(e)
            print("Couldn't read json file!")
|
||||
|
||||
def get_website_by_url(self, url):
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
@@ -158,152 +87,6 @@ class Database:
|
||||
else:
|
||||
return None
|
||||
|
||||
def enqueue(self, website_id, reddit_post_id=None, reddit_comment_id=None, priority=1):
    """Add a website to the Queue table, optionally tagged with reddit ids.

    :param website_id: id of the website to queue
    :param reddit_post_id: optional reddit post id stored with the entry
    :param reddit_comment_id: optional reddit comment id stored with the entry
    :param priority: stored with the entry; ``dequeue`` serves higher values first

    Both reddit ids are written in one statement, so supplying both no longer drops
    the comment id. The insert is ``INSERT OR IGNORE``: a row rejected by a table
    constraint is skipped silently.
    """
    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        # A falsy post id is treated as "no post id", matching the old truthiness test.
        cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, reddit_comment_id, priority) "
                       "VALUES (?,?,?,?)",
                       (website_id, reddit_post_id or None, reddit_comment_id, priority))
        conn.commit()
|
||||
|
||||
def dequeue(self):
    """Remove and return the next Queue entry.

    Entries are served by descending priority, then by ascending Queue.id.

    :return: ``(website_id, reddit_post_id, reddit_comment_id)`` of the removed
        entry, or ``None`` when the queue is empty
    """
    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        cursor.execute("SELECT Queue.id, website_id, reddit_post_id, reddit_comment_id"
                       " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
        entry = cursor.fetchone()
        if entry is None:
            return None

        queue_id, website_id, reddit_post_id, reddit_comment_id = entry
        # Delete by the entry's own id: deleting by website_id would also discard any
        # other entry queued for the same website without ever returning it.
        cursor.execute("DELETE FROM Queue WHERE id=?", (queue_id, ))
        return website_id, reddit_post_id, reddit_comment_id
|
||||
|
||||
def queue(self):
    """Return the queued websites as Website objects, highest priority first.

    Ties are broken by ascending Queue.id. Each Website is built from the
    ``url, logged_ip, logged_useragent, last_modified`` columns, in that order.
    """
    query = ("SELECT url, logged_ip, logged_useragent, last_modified "
             "FROM Queue INNER JOIN Website ON website_id=Website.id "
             "ORDER BY Queue.priority DESC, Queue.id ASC")

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(query)

        websites = []
        for url, logged_ip, logged_useragent, last_modified in cursor.fetchall():
            websites.append(Website(url, logged_ip, logged_useragent, last_modified))
        return websites
|
||||
|
||||
def get_stats(self):
    """Compute global counters over the File and WebsitePath tables.

    :return: dict with
        ``file_count``    - number of File rows,
        ``file_size``     - sum of File.size (0 when there are no files),
        ``website_count`` - number of distinct website_id values in WebsitePath,
        ``website_paths`` - number of WebsitePath rows
    """
    stats = {}
    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        cursor.execute("SELECT COUNT(*), SUM(size) FROM File")
        file_count, file_size = cursor.fetchone()
        stats["file_count"] = file_count
        # SUM() over zero rows is NULL; report 0 like the per-website stats do.
        stats["file_size"] = file_size if file_size else 0

        cursor.execute("SELECT COUNT(DISTINCT website_id), COUNT(id) FROM WebsitePath")
        website_count, website_paths = cursor.fetchone()
        stats["website_count"] = website_count
        stats["website_paths"] = website_paths

    return stats
|
||||
|
||||
def search(self, q, limit: int = 50, offset: int = 0, sort_order="score"):
    """Run a full-text MATCH query against File_index.

    :param q: expression handed to ``File_index MATCH ?``
    :param limit: maximum number of rows returned
    :param offset: page index, not a row count: the SQL OFFSET is ``offset * limit``
    :param sort_order: key of ``Database.SORT_ORDERS``; an unknown key adds no ORDER BY
    :return: list of ``(size, website url, path, file name, website id)`` rows
    :raises InvalidQueryException: when sqlite raises an OperationalError while
        executing the query; the original error is chained as the cause
    """
    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        try:
            # Only clauses from the fixed SORT_ORDERS table are concatenated into the SQL;
            # the user-supplied values are bound as parameters.
            order_by = Database.SORT_ORDERS.get(sort_order, "")
            cursor.execute("SELECT size, Website.url, WebsitePath.path, File.name, Website.id FROM File_index "
                           "INNER JOIN File ON File.id = File_index.rowid "
                           "INNER JOIN WebsitePath ON File.path_id = WebsitePath.id "
                           "INNER JOIN Website ON website_id = Website.id "
                           "WHERE File_index MATCH ? " +
                           order_by + " LIMIT ? OFFSET ?",
                           (q, limit, offset * limit))
        except sqlite3.OperationalError as e:
            raise InvalidQueryException(str(e)) from e

        return cursor.fetchall()
|
||||
|
||||
def get_website_stats(self, website_id):
    """Aggregate size and count statistics over all files of a website.

    :param website_id: id of the website
    :return: dict with
        ``total_size``  - sum of File.size (0 when the website has no files),
        ``total_count`` - number of files,
        ``base_url``    - Website.url, or None when no such website exists,
        ``report_time`` - Website.last_modified, or None when no such website exists,
        ``mime_stats``  - list of ``(total size, file count, mime)`` rows, largest total first
    """
    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
                       "INNER JOIN WebsitePath Path on File.path_id = Path.id "
                       "WHERE Path.website_id = ?", (website_id, ))
        file_sum, file_count = cursor.fetchone()

        cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
                       "INNER JOIN FileType ON FileType.id = File.mime_id "
                       "INNER JOIN WebsitePath Path on File.path_id = Path.id "
                       "WHERE Path.website_id = ? "
                       "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, ))
        db_mime_stats = cursor.fetchall()

        cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, ))
        # fetchone() yields None for an unknown id; unpacking it directly would raise TypeError.
        website_url, website_date = cursor.fetchone() or (None, None)

        return {
            "total_size": file_sum if file_sum else 0,
            "total_count": file_count if file_count else 0,
            "base_url": website_url,
            "report_time": website_date,
            "mime_stats": db_mime_stats
        }
|
||||
|
||||
def get_subdir_stats(self, website_id: int, path: str):
    """Get stats of a sub directory. path must not start with / and must end with /

    Every WebsitePath of the website whose path starts with ``path`` is included.
    ``path`` is matched literally: the LIKE wildcards ``%`` and ``_`` it may contain
    are escaped, so ``a_b/`` no longer also matches ``axb/``.

    :param website_id: id of the website
    :param path: directory prefix
    :return: dict with
        ``total_size``  - sum of File.size (0 when nothing matches),
        ``total_count`` - number of matching files,
        ``base_url``    - Website.url, or None when no such website exists,
        ``report_time`` - Website.last_modified, or None when no such website exists,
        ``mime_stats``  - list of ``(total size, file count, mime)`` rows, largest total first
    """
    # Escape the escape character first, then the two LIKE wildcards.
    prefix = path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + "%"

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()

        cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
                       "INNER JOIN WebsitePath Path on File.path_id = Path.id "
                       "WHERE Path.website_id = ? AND Path.path LIKE ? ESCAPE '\\'", (website_id, prefix))
        file_sum, file_count = cursor.fetchone()

        cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
                       "INNER JOIN FileType ON FileType.id = File.mime_id "
                       "INNER JOIN WebsitePath Path on File.path_id = Path.id "
                       "WHERE Path.website_id = ? AND Path.path LIKE ? ESCAPE '\\' "
                       "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, prefix))
        db_mime_stats = cursor.fetchall()

        cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, ))
        # fetchone() yields None for an unknown id; unpacking it directly would raise TypeError.
        website_url, website_date = cursor.fetchone() or (None, None)

        return {
            "total_size": file_sum if file_sum else 0,
            "total_count": file_count if file_count else 0,
            "base_url": website_url,
            "report_time": website_date,
            "mime_stats": db_mime_stats
        }
|
||||
|
||||
def get_website_links(self, website_id):
    """Get all download links of a website

    Each link is ``website.url + path + "/" + file name``; the ``/`` separator is
    only added when the stored path is not empty.

    :param website_id: id of the website
    :return: list of link strings; empty when the website lookup returns nothing
    """
    website = self.get_website_by_id(website_id)
    if not website:
        # Without a website there is no base url to build links from.
        return []

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT File.name, WebsitePath.path FROM File "
                       "INNER JOIN WebsitePath on File.path_id = WebsitePath.id "
                       "WHERE WebsitePath.website_id = ?", (website.id, ))

        return [website.url + path + ("/" if path else "") + name for name, path in cursor.fetchall()]
|
||||
|
||||
def get_websites(self, per_page, page: int):
|
||||
"""Get all websites"""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
@@ -325,29 +108,13 @@ class Database:
|
||||
|
||||
def website_has_been_scanned(self, url):
    """Check if a website has at least 1 indexed path.

    The query counts WebsitePath rows of the website, not File rows.

    :param url: url handed to ``self.website_exists`` to resolve the website id
    :return: True when the website has at least one WebsitePath row, False when it
        has none, None when ``website_exists`` reports no id for the url
    """
    # TODO: Check with SearchEngine
    website_id = self.website_exists(url)
    if not website_id:
        # Unknown website: answer without opening a connection.
        return None

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(Path.id) FROM Website "
                       "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
                       "WHERE Website.id = ?", (website_id, ))
        return cursor.fetchone()[0] > 0
|
||||
|
||||
def clear_website(self, website_id):
    """Delete every File and WebsitePath row of a website and refresh its last_modified date."""
    params = (website_id, )
    statements = (
        "DELETE FROM File WHERE File.path_id IN (SELECT WebsitePath.id "
        "FROM WebsitePath WHERE WebsitePath.website_id=?)",
        "DELETE FROM WebsitePath WHERE website_id=?",
        "UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=?",
    )

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        # Order matters: the File delete locates its rows through WebsitePath,
        # so the paths can only be removed afterwards.
        for statement in statements:
            cursor.execute(statement, params)
        conn.commit()

    # TODO: Check with SearchEngine
    print("FIXME: clear_website")
|
||||
|
||||
def get_websites_older(self, delta: datetime.timedelta):
|
||||
"""Get websites last updated before a given date"""
|
||||
@@ -358,17 +125,6 @@ class Database:
|
||||
cursor.execute("SELECT Website.id FROM Website WHERE last_modified < ?", (date, ))
|
||||
return [x[0] for x in cursor.fetchall()]
|
||||
|
||||
def get_websites_smaller(self, size: int):
    """Get the websites whose files add up to less than ``size``.

    Only websites reachable through the WebsitePath/File joins are considered,
    so a website without any file is never returned.

    :param size: exclusive upper bound on the summed File.size
    :return: raw result rows, i.e. a list of 1-tuples ``(website id, )``
    """
    query = ("SELECT Website.id FROM Website "
             "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
             "INNER JOIN File F on Path.id = F.path_id "
             "GROUP BY Website.id HAVING SUM(F.size) < ?")

    with sqlite3.connect(self.db_path) as conn:
        cursor = conn.cursor()
        cursor.execute(query, (size, ))
        small_websites = cursor.fetchall()
        return small_websites
|
||||
|
||||
def delete_website(self, website_id):
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
|
||||
Reference in New Issue
Block a user