diff --git a/app.py b/app.py
index 072361b..1d88833 100644
--- a/app.py
+++ b/app.py
@@ -130,10 +130,16 @@ def enqueue():
"FTP is not supported", "danger")
return redirect("/submit")
+ if od_util.is_blacklisted(url):
+ flash("Error: "
+ "Sorry, this website has been blacklisted. If you think "
+ "this is an error, please contact me.", "danger")
+ return redirect("/submit")
+
if not od_util.is_od(url):
flash("Error:"
"The anti-spam algorithm determined that the submitted url is not "
- "an open directory or the server is not responding. If you think"
+ "an open directory or the server is not responding. If you think "
"this is an error, please contact me.", "danger")
return redirect("/submit")
diff --git a/blacklist.txt b/blacklist.txt
new file mode 100644
index 0000000..39ada60
--- /dev/null
+++ b/blacklist.txt
@@ -0,0 +1,5 @@
+https://sdo.gsfc.nasa.gov
+https://drive.google
+https://mirror.math.princeton.edu
+http://mirror.math.princeton.edu
+https://www.dropbox.com
\ No newline at end of file
diff --git a/database.py b/database.py
index 1e0ede8..6a96f25 100644
--- a/database.py
+++ b/database.py
@@ -87,11 +87,14 @@ class Database:
conn.commit()
# Then insert files
- cursor.execute("PRAGMA foreign_keys = OFF")
- conn.commit()
cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
[(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])
+ # Update date
+ if len(files) > 0:
+ cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
+ (files[0].website_id, ))
+
conn.commit()
def import_json(self, json_file, website: Website):
@@ -302,6 +305,21 @@ class Database:
website_id = cursor.fetchone()
return website_id[0] if website_id else None
+    def website_has_been_scanned(self, url):
+        """Report whether the website found by website_exists(url) has any WebsitePath row.
+
+        Returns None if website_exists(url) is falsy, else a bool from the row count.
+        """
+        with sqlite3.connect(self.db_path) as conn:
+            cur = conn.cursor()
+            site_id = self.website_exists(url)
+            if not site_id:
+                return None
+            cur.execute("SELECT COUNT(Path.id) FROM Website "
+                        "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
+                        "WHERE Website.id = ?", (site_id, ))
+            return cur.fetchone()[0] > 0
+
def clear_website(self, website_id):
"""Remove all files from a website and update its last_updated date"""
with sqlite3.connect(self.db_path) as conn:
diff --git a/od_util.py b/od_util.py
index fe4f3f5..775f108 100644
--- a/od_util.py
+++ b/od_util.py
@@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):
def is_od(url):
- if "?" in url:
- print("Url has parameter in url!")
- return False
if not url.endswith("/"):
print("Url does not end with trailing /")
@@ -97,3 +94,13 @@ def is_od(url):
except Exception as e:
print(e)
return False
+
+
+def is_blacklisted(url):
+    """Return True when url starts with a non-blank line of blacklist.txt."""
+    with open("blacklist.txt", "r") as f:
+        # Drop blank lines: url.startswith("") is always True, so a single
+        # empty line would otherwise blacklist every submitted url.
+        prefixes = tuple(entry for entry in map(str.strip, f) if entry)
+
+    return url.startswith(prefixes)
diff --git a/queue_reddit_links.py b/queue_reddit_links.py
index 80e8f04..ea23183 100644
--- a/queue_reddit_links.py
+++ b/queue_reddit_links.py
@@ -19,7 +19,7 @@ submissions = []
def handle_exact_repost(website_id, reddit_obj):
stats = db.get_website_stats(website_id)
comment = bot.get_comment({"": stats}, website_id,
- f"I already scanned this website on {website.last_modified} UTC")
+ "I already scanned this website on " + website.last_modified + " UTC")
print(comment)
print("Exact repost!")
bot.reply(reddit_obj, comment)
@@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):
subdir_stats = db.get_subdir_stats(website_id, subdir)
stats = db.get_website_stats(website_id)
- comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
- website_id, f"I already scanned a parent directory of this website on"
- f" {website.last_modified} UTC")
+ comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
+                           website_id, "I already scanned a parent directory of this website on "
+                           + website.last_modified + " UTC")
print(comment)
print("Subdir repost!")
bot.reply(reddit_obj, comment)
@@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
lines = text.split()
if len(lines) > 1:
url = os.path.join(lines[1], "") # Add trailing slash
+ scanned = db.website_has_been_scanned(url)
website = db.get_website_by_url(url)
- if website:
+ if website and not scanned:
+ # in progress
+ pass
+
+ if website and db.website_has_been_scanned(url):
bot.log_crawl(comment.id)
handle_exact_repost(website.id, comment)
continue
website_id = db.website_exists(url)
- if website_id:
+ if website_id and not scanned:
+ # IN progress
+ pass
+ if website_id and db.website_has_been_scanned(url):
bot.log_crawl(comment.id)
handle_subdir_repost(website_id, comment)
continue
@@ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50):
if not od_util.is_valid_url(url):
print("Skipping reddit comment: Invalid url")
bot.log_crawl(comment.id)
- bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
- f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+ bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
+                    "provided: `" + url + "` is not valid. Make sure that you include the "
+                    "`http(s)://` prefix. \n")
continue
+ if od_util.is_blacklisted(url):
+     print("Skipping reddit comment: blacklisted")
+     bot.log_crawl(comment.id)
+     bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately my programmer has blacklisted "
+                        "this website. If you think that this is an error, please "
+                        "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+     continue  # stop here: a blacklisted url must not reach the is_od() probe below
if not od_util.is_od(url):
print("Skipping reddit comment: Not an OD")
print(url)
bot.log_crawl(comment.id)
- bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
- f"{url}` does not point to an open directory. This could also mean that the "
- f"website is not responding (in which case, feel free to retry in a few minutes)"
- f" If you think that this is an error, please "
- f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+ bot.reply(comment, "Hello, " + str(comment.author) + ". Unfortunately it seems that the link you "
+                    "provided: `" + url + "` does not point to an open directory. This could also"
+                    " mean that the website is not responding (in which case, feel free to retry in "
+                    "a few minutes). If you think that this is an error, please "
+                    "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
continue
bot.log_crawl(comment.id)
@@ -116,6 +132,11 @@ for s in submissions:
bot.log_crawl(s.id)
continue
+ if od_util.is_blacklisted(url):
+ print("Skipping reddit post: blacklisted")
+ bot.log_crawl(s.id)
+ continue
+
if not od_util.is_od(url):
print("Skipping reddit post: Not an OD")
print(url)
diff --git a/static/js/report.js b/static/js/report.js
index 5792482..ae46c2e 100644
--- a/static/js/report.js
+++ b/static/js/report.js
@@ -113,7 +113,7 @@ function getRandomColor() {
*/
function humanFileSize(bytes) {
- if(bytes === 0) {
+ if(bytes <= 0) {
return "? B"
}