Changed the graph to group files by extension instead of MIME type, and added a script to clear invalid websites

This commit is contained in:
Simon 2018-05-31 10:51:59 -04:00
parent 819e2fbddb
commit bb872a9248
6 changed files with 33 additions and 11 deletions

View File

@ -0,0 +1,9 @@
from database import Database

# Purge every website whose indexed files add up to less than ~10 MB:
# first clear its file records, then remove the website row itself.
db = Database("db.sqlite3")

for row in db.get_websites_smaller(10000000):
    website_id = row[0]
    db.clear_website(website_id)
    db.delete_website(website_id)
    print("Deleted " + str(website_id))

View File

@ -73,15 +73,15 @@ class Database:
cursor.execute("SELECT LAST_INSERT_ROWID()")
website_paths[file.path] = cursor.fetchone()[0]
# Then MimeTypes
# Then FileTypes
mimetypes = dict()
cursor.execute("SELECT * FROM MimeType")
cursor.execute("SELECT * FROM FileType")
db_mimetypes = cursor.fetchall()
for db_mimetype in db_mimetypes:
mimetypes[db_mimetype[1]] = db_mimetype[0]
for file in files:
if file.mime not in mimetypes:
cursor.execute("INSERT INTO MimeType (mime) VALUES (?)", (file.mime, ))
cursor.execute("INSERT INTO FileType (mime) VALUES (?)", (file.mime, ))
cursor.execute("SELECT LAST_INSERT_ROWID()")
mimetypes[file.mime] = cursor.fetchone()[0]
@ -103,7 +103,8 @@ class Database:
with open(json_file, "r") as f:
try:
self.insert_files([File(website_id, x["path"], x["mime"], x["name"], x["size"]) for x in json.load(f)])
self.insert_files([File(website_id, x["path"], os.path.splitext(x["name"])[1].lower(), x["name"], x["size"])
for x in json.load(f)])
except Exception as e:
print(e)
print("Couldn't read json file!")
@ -218,11 +219,11 @@ class Database:
"WHERE File.path_id IN (SELECT id FROM WebsitePath WHERE website_id = ?)", (website_id, ))
file_sum, file_count = cursor.fetchone()
cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), MimeType.mime FROM File "
"INNER JOIN MimeType ON MimeType.id = File.mime_id "
cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
"INNER JOIN FileType ON FileType.id = File.mime_id "
"INNER JOIN WebsitePath Path on File.path_id = Path.id "
"WHERE Path.website_id = ? "
"GROUP BY MimeType.id ORDER BY total_size DESC", (website_id, ))
"GROUP BY FileType.id ORDER BY total_size DESC", (website_id, ))
db_mime_stats = cursor.fetchall()
cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, ))
@ -287,6 +288,17 @@ class Database:
cursor.execute("SELECT Website.id FROM Website WHERE last_modified < ?", (date, ))
return [x[0] for x in cursor.fetchall()]
def get_websites_smaller(self, size: int):
    """Return the ids of websites whose files sum to fewer than *size* bytes.

    Each result row is a 1-tuple ``(website_id,)`` as returned by sqlite3.
    Websites with no files at all are not matched (the INNER JOINs drop them).
    """
    query = ("SELECT Website.id FROM Website "
             "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
             "INNER JOIN File F on Path.id = F.path_id "
             "GROUP BY Website.id HAVING SUM(F.size) < ?")
    with sqlite3.connect(self.db_path) as conn:
        cur = conn.cursor()
        cur.execute(query, (size,))
        rows = cur.fetchall()
    return rows
def delete_website(self, website_id):
with sqlite3.connect(self.db_path) as conn:

View File

@ -2,4 +2,4 @@ SELECT Website.url, WebsitePath.path, File.name, File.size, MT.mime
FROM File
INNER JOIN WebsitePath on File.path_id = WebsitePath.id
INNER JOIN Website on WebsitePath.website_id = Website.id
INNER JOIN MimeType MT on File.mime_id = MT.id;
INNER JOIN FileType MT on File.mime_id = MT.id;

View File

@ -16,7 +16,7 @@ CREATE TABLE WebsitePath (
FOREIGN KEY (website_id) REFERENCES Website(id)
);
CREATE TABLE MimeType (
CREATE TABLE FileType (
id INTEGER PRIMARY KEY NOT NULL,
mime TEXT
);
@ -29,7 +29,7 @@ CREATE TABLE File (
size INTEGER,
FOREIGN KEY (path_id) REFERENCES WebsitePath(id),
FOREIGN KEY (mime_id) REFERENCES MimeType(id)
FOREIGN KEY (mime_id) REFERENCES FileType(id)
);
CREATE TABLE Queue (

File diff suppressed because one or more lines are too long

View File

@ -57,6 +57,7 @@ class TaskManager:
print("Imported in SQLite3")
if post_id:
# TODO check should_comment()
stats = self.db.get_website_stats(website.id)
comment = self.reddit_bot.get_comment(stats, website.id)
print(comment)