Added blacklist feature (untested)

Simon 2018-06-06 10:17:30 -04:00
parent cfa6a9f02f
commit 0b1d76f478
6 changed files with 77 additions and 20 deletions

app.py

@@ -130,10 +130,16 @@ def enqueue():
               "FTP is not supported", "danger")
         return redirect("/submit")
 
+    if od_util.is_blacklisted(url):
+        flash("<strong>Error:</strong> "
+              "Sorry, this website has been blacklisted. If you think "
+              "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
+        return redirect("/submit")
+
     if not od_util.is_od(url):
         flash("<strong>Error:</strong>"
               "The anti-spam algorithm determined that the submitted url is not "
-              "an open directory or the server is not responding. If you think"
+              "an open directory or the server is not responding. If you think "
              "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
         return redirect("/submit")

blacklist.txt (new file)

@@ -0,0 +1,5 @@
+https://sdo.gsfc.nasa.gov
+https://drive.google
+https://mirror.math.princeton.edu
+http://mirror.math.princeton.edu
+https://www.dropbox.com
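
The entries are matched as raw URL prefixes by the is_blacklisted helper added to od_util.py below, so a short entry can cover a whole host. A minimal illustration (the URL is a hypothetical example):

    # Prefix matching: the entry "https://drive.google" also covers drive.google.com
    url = "https://drive.google.com/drive/folders/abc"
    print(url.startswith("https://drive.google"))  # True -> would be rejected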

database.py

@@ -87,11 +87,14 @@ class Database:
             conn.commit()
 
             # Then insert files
+            cursor.execute("PRAGMA foreign_keys = OFF")
+            conn.commit()
             cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
                                [(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])
+            # Update date
+            if len(files) > 0:
+                cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
+                               (files[0].website_id, ))
             conn.commit()
 
     def import_json(self, json_file, website: Website):
@@ -302,6 +305,21 @@ class Database:
             website_id = cursor.fetchone()
             return website_id[0] if website_id else None
 
+    def website_has_been_scanned(self, url):
+        """Check if a website has at least 1 file"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+            website_id = self.website_exists(url)
+            if website_id:
+                cursor.execute("SELECT COUNT(Path.id) FROM Website "
+                               "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
+                               "WHERE Website.id = ?", (website_id, ))
+                return cursor.fetchone()[0] > 0
+        return None
+
     def clear_website(self, website_id):
         """Remove all files from a website and update its last_updated date"""
         with sqlite3.connect(self.db_path) as conn:
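
A minimal usage sketch for the new helper; the database filename and URL are assumptions for illustration, and the three branches mirror the checks the reddit bot performs below:

    db = Database("db.sqlite3")           # assumed path, for illustration only
    url = "http://example.com/files/"     # hypothetical submission

    if db.website_has_been_scanned(url):  # True: at least one file indexed
        print("already scanned -> treat as repost")
    elif db.website_exists(url):          # known id, but no files yet
        print("crawl still in progress")
    else:                                 # website_has_been_scanned returned None
        print("new website -> queue it")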

od_util.py

@@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):
 
 def is_od(url):
-    if "?" in url:
-        print("Url has parameter in url!")
-        return False
 
     if not url.endswith("/"):
         print("Url does not end with trailing /")
 
@@ -97,3 +94,13 @@ def is_od(url):
     except Exception as e:
         print(e)
         return False
+
+
+def is_blacklisted(url):
+    with open("blacklist.txt", "r") as f:
+        for line in f.readlines():
+            if url.startswith(line.strip()):
+                return True
+
+    return False
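
Since is_blacklisted re-reads blacklist.txt on every call, one possible refinement (not part of this commit) is to cache the prefixes at import time and skip blank lines, which would otherwise match every URL, because any string starts with the empty string:

    # Alternative sketch, not in this commit: load once, ignore blank lines
    with open("blacklist.txt", "r") as f:
        _BLACKLIST = [line.strip() for line in f if line.strip()]

    def is_blacklisted(url):
        return any(url.startswith(prefix) for prefix in _BLACKLIST)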

reddit_bot.py

@@ -19,7 +19,7 @@ submissions = []
 
 def handle_exact_repost(website_id, reddit_obj):
     stats = db.get_website_stats(website_id)
     comment = bot.get_comment({"": stats}, website_id,
-                              f"I already scanned this website on {website.last_modified} UTC")
+                              "I already scanned this website on " + website.last_modified + " UTC")
     print(comment)
     print("Exact repost!")
     bot.reply(reddit_obj, comment)
 
@@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):
     subdir_stats = db.get_subdir_stats(website_id, subdir)
     stats = db.get_website_stats(website_id)
-    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
-                              website_id, f"I already scanned a parent directory of this website on"
-                              f" {website.last_modified} UTC")
+    comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
+                              website_id, "I already scanned a parent directory of this website on"
+                              + website.last_modified + " UTC")
     print(comment)
     print("Subdir repost!")
     bot.reply(reddit_obj, comment)
@@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
         lines = text.split()
         if len(lines) > 1:
             url = os.path.join(lines[1], "")  # Add trailing slash
+            scanned = db.website_has_been_scanned(url)
+
             website = db.get_website_by_url(url)
-            if website:
+            if website and not scanned:
+                # in progress
+                pass
+            if website and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_exact_repost(website.id, comment)
                 continue
 
             website_id = db.website_exists(url)
-            if website_id:
+            if website_id and not scanned:
+                # IN progress
+                pass
+            if website_id and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_subdir_repost(website_id, comment)
                 continue
@@ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50):
             if not od_util.is_valid_url(url):
                 print("Skipping reddit comment: Invalid url")
                 bot.log_crawl(comment.id)
-                bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                                   f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                                   "provided: `" + url + "` is not valid. Make sure that you include the"
+                                   "`http(s)://` prefix. \n")
                 continue
 
+            if od_util.is_blacklisted(url):
+                print("Skipping reddit comment: blacklisted")
+                bot.log_crawl(comment.id)
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted "
+                                   "this website. If you think that this is an error, please "
+                                   "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+
             if not od_util.is_od(url):
                 print("Skipping reddit comment: Not an OD")
                 print(url)
                 bot.log_crawl(comment.id)
-                bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                                   f"{url}` does not point to an open directory. This could also mean that the "
-                                   f"website is not responding (in which case, feel free to retry in a few minutes)"
-                                   f" If you think that this is an error, please "
-                                   f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+                bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                                   "provided: `" + url + "` does not point to an open directory. This could also"
+                                   " mean that the website is not responding (in which case, feel free to retry in "
+                                   "a few minutes). If you think that this is an error, please "
+                                   "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
                 continue
 
             bot.log_crawl(comment.id)
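
Note that, unlike the submissions loop below, the new blacklist branch in the comment handler has no continue, so a blacklisted comment still falls through to the is_od check after the reply is sent. A one-line sketch of the presumably intended control flow, given the commit is marked untested:

    if od_util.is_blacklisted(url):
        ...       # print / log_crawl / reply as above
        continue  # presumably intended, as in the submissions loop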
@@ -116,6 +132,11 @@ for s in submissions:
                 bot.log_crawl(s.id)
                 continue
 
+            if od_util.is_blacklisted(url):
+                print("Skipping reddit post: blacklisted")
+                bot.log_crawl(s.id)
+                continue
+
             if not od_util.is_od(url):
                 print("Skipping reddit post: Not an OD")
                 print(url)


@@ -113,7 +113,7 @@ function getRandomColor() {
  */
 function humanFileSize(bytes) {
 
-    if(bytes === 0) {
+    if(bytes <= 0) {
         return "? B"
     }