Added blacklist feature (untested)

Simon 2018-06-06 10:17:30 -04:00
parent cfa6a9f02f
commit 0b1d76f478
6 changed files with 77 additions and 20 deletions

app.py

@@ -130,10 +130,16 @@ def enqueue():
               "FTP is not supported", "danger")
         return redirect("/submit")
 
+    if od_util.is_blacklisted(url):
+        flash("<strong>Error:</strong> "
+              "Sorry, this website has been blacklisted. If you think "
+              "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
+        return redirect("/submit")
+
     if not od_util.is_od(url):
         flash("<strong>Error:</strong>"
               "The anti-spam algorithm determined that the submitted url is not "
-              "an open directory or the server is not responding. If you think"
+              "an open directory or the server is not responding. If you think "
              "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
         return redirect("/submit")

blacklist.txt (new file)

@@ -0,0 +1,5 @@
+https://sdo.gsfc.nasa.gov
+https://drive.google
+https://mirror.math.princeton.edu
+http://mirror.math.princeton.edu
+https://www.dropbox.com
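
The entries are matched as raw URL prefixes by the is_blacklisted helper added to od_util.py below, so a short entry can cover a whole host. A minimal illustration (the URL is a hypothetical example):

    # Prefix matching: the entry "https://drive.google" also covers drive.google.com
    url = "https://drive.google.com/drive/folders/abc"
    print(url.startswith("https://drive.google"))  # True -> would be rejected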

database.py

@@ -87,11 +87,14 @@ class Database:
             conn.commit()
 
             # Then insert files
+            cursor.execute("PRAGMA foreign_keys = OFF")
+            conn.commit()
             cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
                                [(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])
+            # Update date
+            if len(files) > 0:
+                cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
+                               (files[0].website_id, ))
             conn.commit()
 
     def import_json(self, json_file, website: Website):
@@ -302,6 +305,21 @@ class Database:
             website_id = cursor.fetchone()
             return website_id[0] if website_id else None
 
+    def website_has_been_scanned(self, url):
+        """Check if a website has at least 1 file"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            website_id = self.website_exists(url)
+            if website_id:
+                cursor.execute("SELECT COUNT(Path.id) FROM Website "
+                               "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
+                               "WHERE Website.id = ?", (website_id, ))
+                return cursor.fetchone()[0] > 0
+        return None
+
     def clear_website(self, website_id):
         """Remove all files from a website and update its last_updated date"""
         with sqlite3.connect(self.db_path) as conn:
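
A minimal usage sketch for the new helper; the database filename and URL are assumptions for illustration, and the three branches mirror the checks the reddit bot performs below:

    db = Database("db.sqlite3")           # assumed path, for illustration only
    url = "http://example.com/files/"     # hypothetical submission

    if db.website_has_been_scanned(url):  # True: at least one file indexed
        print("already scanned -> treat as repost")
    elif db.website_exists(url):          # known id, but no files yet
        print("crawl still in progress")
    else:                                 # website_has_been_scanned returned None
        print("new website -> queue it")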

od_util.py

@@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):
 
 def is_od(url):
-    if "?" in url:
-        print("Url has parameter in url!")
-        return False
 
     if not url.endswith("/"):
         print("Url does not end with trailing /")
 
@@ -97,3 +94,13 @@ def is_od(url):
     except Exception as e:
         print(e)
         return False
+
+
+def is_blacklisted(url):
+    with open("blacklist.txt", "r") as f:
+        for line in f.readlines():
+            if url.startswith(line.strip()):
+                return True
+
+    return False
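
Since is_blacklisted re-reads blacklist.txt on every call, one possible refinement (not part of this commit) is to cache the prefixes at import time and skip blank lines, which would otherwise match every URL, because any string starts with the empty string:

    # Alternative sketch, not in this commit: load once, ignore blank lines
    with open("blacklist.txt", "r") as f:
        _BLACKLIST = [line.strip() for line in f if line.strip()]

    def is_blacklisted(url):
        return any(url.startswith(prefix) for prefix in _BLACKLIST)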

reddit_bot.py

@@ -19,7 +19,7 @@ submissions = []
 
 def handle_exact_repost(website_id, reddit_obj):
     stats = db.get_website_stats(website_id)
     comment = bot.get_comment({"": stats}, website_id,
-                              f"I already scanned this website on {website.last_modified} UTC")
+                              "I already scanned this website on " + website.last_modified + " UTC")
     print(comment)
     print("Exact repost!")
     bot.reply(reddit_obj, comment)
 
@@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):
     subdir_stats = db.get_subdir_stats(website_id, subdir)
     stats = db.get_website_stats(website_id)
-    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
-                              website_id, f"I already scanned a parent directory of this website on"
-                              f" {website.last_modified} UTC")
+    comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
+                              website_id, "I already scanned a parent directory of this website on"
+                              + website.last_modified + " UTC")
     print(comment)
     print("Subdir repost!")
     bot.reply(reddit_obj, comment)
@@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
         lines = text.split()
         if len(lines) > 1:
             url = os.path.join(lines[1], "")  # Add trailing slash
+            scanned = db.website_has_been_scanned(url)
+
             website = db.get_website_by_url(url)
-            if website:
+            if website and not scanned:
+                # in progress
+                pass
+            if website and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_exact_repost(website.id, comment)
                 continue
 
             website_id = db.website_exists(url)
-            if website_id:
+            if website_id and not scanned:
+                # IN progress
+                pass
+            if website_id and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_subdir_repost(website_id, comment)
                 continue
@@ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50):
             if not od_util.is_valid_url(url):
                 print("Skipping reddit comment: Invalid url")
                 bot.log_crawl(comment.id)
-                bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                                   f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                                   "provided: `" + url + "` is not valid. Make sure that you include the"
+                                   "`http(s)://` prefix. \n")
                 continue
 
+            if od_util.is_blacklisted(url):
+                print("Skipping reddit comment: blacklisted")
+                bot.log_crawl(comment.id)
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted "
+                                   "this website. If you think that this is an error, please "
+                                   "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+
             if not od_util.is_od(url):
                 print("Skipping reddit comment: Not an OD")
                 print(url)
                 bot.log_crawl(comment.id)
-                bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                                   f"{url}` does not point to an open directory. This could also mean that the "
-                                   f"website is not responding (in which case, feel free to retry in a few minutes)"
-                                   f" If you think that this is an error, please "
-                                   f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                                   "provided: `" + url + "` does not point to an open directory. This could also"
+                                   " mean that the website is not responding (in which case, feel free to retry in "
+                                   "a few minutes). If you think that this is an error, please "
+                                   "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
                 continue
 
             bot.log_crawl(comment.id)
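
Note that, unlike the submissions loop below, the new blacklist branch in the comment handler has no continue, so a blacklisted comment still falls through to the is_od check after the reply is sent. A one-line sketch of the presumably intended control flow, given the commit is marked untested:

    if od_util.is_blacklisted(url):
        ...       # print / log_crawl / reply as above
        continue  # presumably intended, as in the submissions loop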
@@ -116,6 +132,11 @@ for s in submissions:
                 bot.log_crawl(s.id)
                 continue
 
+            if od_util.is_blacklisted(url):
+                print("Skipping reddit post: blacklisted")
+                bot.log_crawl(s.id)
+                continue
+
             if not od_util.is_od(url):
                 print("Skipping reddit post: Not an OD")
                 print(url)


@@ -113,7 +113,7 @@ function getRandomColor() {
  */
 function humanFileSize(bytes) {
 
-    if(bytes === 0) {
+    if(bytes <= 0) {
         return "? B"
     }