From 0b1d76f47864abdf999948716648cb3ad18210d3 Mon Sep 17 00:00:00 2001
From: Simon
Date: Wed, 6 Jun 2018 10:17:30 -0400
Subject: [PATCH] Added blacklist feature (untested)

---
 app.py                |  8 +++++++-
 blacklist.txt         |  5 +++++
 database.py           | 22 ++++++++++++++++++--
 od_util.py            | 13 +++++++++---
 queue_reddit_links.py | 47 +++++++++++++++++++++++++++++++------------
 static/js/report.js   |  2 +-
 6 files changed, 77 insertions(+), 20 deletions(-)
 create mode 100644 blacklist.txt

diff --git a/app.py b/app.py
index 072361b..1d88833 100644
--- a/app.py
+++ b/app.py
@@ -130,10 +130,16 @@ def enqueue():
                   "FTP is not supported", "danger")
             return redirect("/submit")

+        if od_util.is_blacklisted(url):
+            flash("Error: "
+                  "Sorry, this website has been blacklisted. If you think "
+                  "this is an error, please contact me.", "danger")
+            return redirect("/submit")
+
         if not od_util.is_od(url):
             flash("Error:"
                   "The anti-spam algorithm determined that the submitted url is not "
-                  "an open directory or the server is not responding. If you think"
+                  "an open directory or the server is not responding. If you think "
                   "this is an error, please contact me.", "danger")
             return redirect("/submit")

diff --git a/blacklist.txt b/blacklist.txt
new file mode 100644
index 0000000..39ada60
--- /dev/null
+++ b/blacklist.txt
@@ -0,0 +1,5 @@
+https://sdo.gsfc.nasa.gov
+https://drive.google
+https://mirror.math.princeton.edu
+http://mirror.math.princeton.edu
+https://www.dropbox.com
\ No newline at end of file
diff --git a/database.py b/database.py
index 1e0ede8..6a96f25 100644
--- a/database.py
+++ b/database.py
@@ -87,11 +87,14 @@ class Database:
                 conn.commit()

             # Then insert files
-            cursor.execute("PRAGMA foreign_keys = OFF")
-            conn.commit()
             cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
                                [(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])

+            # Update date
+            if len(files) > 0:
+                cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
+                               (files[0].website_id, ))
+
             conn.commit()

     def import_json(self, json_file, website: Website):
@@ -302,6 +305,21 @@ class Database:
             website_id = cursor.fetchone()
             return website_id[0] if website_id else None
+    def website_has_been_scanned(self, url):
+        """Check if a website has at least 1 file"""
+
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+
+            website_id = self.website_exists(url)
+
+            if website_id:
+                cursor.execute("SELECT COUNT(Path.id) FROM Website "
+                               "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
+                               "WHERE Website.id = ?", (website_id, ))
+                return cursor.fetchone()[0] > 0
+        return None
+

     def clear_website(self, website_id):
         """Remove all files from a website and update its last_updated date"""
         with sqlite3.connect(self.db_path) as conn:
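The new Database.website_has_been_scanned() helper distinguishes three states, which the Reddit queue script further down relies on. A minimal usage sketch, where the database file name is an assumption for illustration and not part of the patch:

    from database import Database

    db = Database("db.sqlite3")  # assumed path, purely illustrative

    # None  -> the URL is not in the Website table at all
    # False -> the website row exists but no paths have been recorded yet
    # True  -> at least one crawled path is stored for this website
    state = db.website_has_been_scanned("http://example.com/files/")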
diff --git a/od_util.py b/od_util.py
index fe4f3f5..775f108 100644
--- a/od_util.py
+++ b/od_util.py
@@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):


 def is_od(url):
-    if "?" in url:
-        print("Url has parameter in url!")
-        return False

     if not url.endswith("/"):
         print("Url does not end with trailing /")
@@ -97,3 +94,13 @@ def is_od(url):
     except Exception as e:
         print(e)
         return False
+
+
+def is_blacklisted(url):
+
+    with open("blacklist.txt", "r") as f:
+        for line in f.readlines():
+            if url.startswith(line.strip()):
+                return True
+
+    return False
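is_blacklisted() is a plain prefix check against each stripped line of blacklist.txt, opened relative to the current working directory, so callers are expected to run from the repository root. A quick sketch of the matching behaviour, with example URLs that are illustrative only:

    import od_util

    # "https://drive.google" is a prefix of any drive.google.com URL,
    # so the truncated blacklist entry still catches Google Drive links.
    od_util.is_blacklisted("https://drive.google.com/drive/folders/abc")  # True
    od_util.is_blacklisted("https://www.dropbox.com/sh/xyz/")             # True
    od_util.is_blacklisted("http://mirror.example.org/pub/")              # False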
diff --git a/queue_reddit_links.py b/queue_reddit_links.py
index 80e8f04..ea23183 100644
--- a/queue_reddit_links.py
+++ b/queue_reddit_links.py
@@ -19,7 +19,7 @@ submissions = []
 def handle_exact_repost(website_id, reddit_obj):
     stats = db.get_website_stats(website_id)
     comment = bot.get_comment({"": stats}, website_id,
-                              f"I already scanned this website on {website.last_modified} UTC")
+                              "I already scanned this website on " + website.last_modified + " UTC")
     print(comment)
     print("Exact repost!")
     bot.reply(reddit_obj, comment)
@@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):

     subdir_stats = db.get_subdir_stats(website_id, subdir)
     stats = db.get_website_stats(website_id)
-    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
-                              website_id, f"I already scanned a parent directory of this website on"
-                                          f" {website.last_modified} UTC")
+    comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
+                              website_id, "I already scanned a parent directory of this website on" +
+                              website.last_modified + " UTC")
     print(comment)
     print("Subdir repost!")
     bot.reply(reddit_obj, comment)
@@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
     lines = text.split()
     if len(lines) > 1:
         url = os.path.join(lines[1], "")  # Add trailing slash
+        scanned = db.website_has_been_scanned(url)

         website = db.get_website_by_url(url)

-        if website:
+        if website and not scanned:
+            # in progress
+            pass
+
+        if website and db.website_has_been_scanned(url):
             bot.log_crawl(comment.id)
             handle_exact_repost(website.id, comment)
             continue

         website_id = db.website_exists(url)
-        if website_id:
+        if website_id and not scanned:
+            # IN progress
+            pass
+        if website_id and db.website_has_been_scanned(url):
             bot.log_crawl(comment.id)
             handle_subdir_repost(website_id, comment)
             continue

         if not od_util.is_valid_url(url):
             print("Skipping reddit comment: Invalid url")
             bot.log_crawl(comment.id)
-            bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                               f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+            bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                               "provided: `" + url + "` is not valid. Make sure that you include the "
+                               "`http(s)://` prefix. \n")
             continue

+        if od_util.is_blacklisted(url):
+            print("Skipping reddit comment: blacklisted")
+            bot.log_crawl(comment.id)
+            bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted "
+                               "this website. If you think that this is an error, please "
+                               "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+
         if not od_util.is_od(url):
             print("Skipping reddit comment: Not an OD")
             print(url)
             bot.log_crawl(comment.id)
-            bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                               f"{url}` does not point to an open directory. This could also mean that the "
-                               f"website is not responding (in which case, feel free to retry in a few minutes)"
-                               f" If you think that this is an error, please "
-                               f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+            bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                               "provided: `" + url + "` does not point to an open directory. This could also"
+                               " mean that the website is not responding (in which case, feel free to retry in "
+                               "a few minutes). If you think that this is an error, please "
+                               "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
             continue

         bot.log_crawl(comment.id)
@@ -116,6 +132,11 @@ for s in submissions:
             bot.log_crawl(s.id)
             continue

+        if od_util.is_blacklisted(url):
+            print("Skipping reddit post: blacklisted")
+            bot.log_crawl(s.id)
+            continue
+
         if not od_util.is_od(url):
             print("Skipping reddit post: Not an OD")
             print(url)
diff --git a/static/js/report.js b/static/js/report.js
index 5792482..ae46c2e 100644
--- a/static/js/report.js
+++ b/static/js/report.js
@@ -113,7 +113,7 @@ function getRandomColor() {
  */
 function humanFileSize(bytes) {

-    if(bytes === 0) {
+    if(bytes <= 0) {
         return "? B"
     }
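Since the subject line flags the feature as untested, a minimal pytest sketch for the new check could look like the following. The test module name, the use of pytest, and the assumption that tests run from the repository root (so blacklist.txt is found) are mine, not part of the patch:

    # test_blacklist.py -- hypothetical test module, run from the repository root
    import od_util


    def test_blacklisted_prefix_is_rejected():
        # https://www.dropbox.com is listed in blacklist.txt
        assert od_util.is_blacklisted("https://www.dropbox.com/sh/somefolder/")


    def test_unlisted_host_is_allowed():
        assert not od_util.is_blacklisted("http://files.example.org/public/")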