Duplicate website w/ reddit post + refactor

2025-10-26 12:06:52 +00:00 · 2018-06-08 10:40:58 -04:00 · 2018-06-08 10:40:58 -04:00 · 537228444b
commit 537228444b
parent 7f1e12cc3c
2 changed files with 56 additions and 48 deletions
--- a/app.py
+++ b/app.py
@@ -144,46 +144,47 @@ def submit():
    return render_template("submit.html", queue=db.queue(), recaptcha=recaptcha)


+def try_enqueue(url):
+
+    url = os.path.join(url, "")
+    website = db.get_website_by_url(url)
+
+    if website:
+        return "Website already exists", "danger"
+
+    website = db.website_exists(url)
+    if website:
+        return "A parent directory of this url has already been posted", "danger"
+
+    if not od_util.is_valid_url(url):
+        return "<strong>Error:</strong> Invalid url. Make sure to include the http(s):// suffix. " \
+               "FTP is not supported", "danger"
+
+    if od_util.is_blacklisted(url):
+
+        return "<strong>Error:</strong> " \
+              "Sorry, this website has been blacklisted. If you think " \
+              "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
+
+    if not od_util.is_od(url):
+        return "<strong>Error:</strong>" \
+              "The anti-spam algorithm determined that the submitted url is not " \
+              "an open directory or the server is not responding. If you think " \
+              "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
+
+    web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
+    db.enqueue(web_id)
+
+    return "The website has been added to the queue", "success"
+
+
@app.route("/enqueue", methods=["POST"])
 def enqueue():
    if recaptcha.verify():

        url = os.path.join(request.form.get("url"), "")
-
-        website = db.get_website_by_url(url)
-
-        if website:
-            flash("Website already exists", "danger")
-            return redirect("/submit")
-
-        website = db.website_exists(url)
-        if website:
-            flash("A parent directory of this url has already been posted", "danger")
-            return redirect("/submit")
-
-        if not od_util.is_valid_url(url):
-            flash("<strong>Error:</strong> "
-                  "Invalid url. Make sure to include the http(s):// suffix. "
-                  "FTP is not supported", "danger")
-            return redirect("/submit")
-
-        if od_util.is_blacklisted(url):
-            flash("<strong>Error:</strong> "
-                  "Sorry, this website has been blacklisted. If you think "
-                  "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
-            return redirect("/submit")
-
-        if not od_util.is_od(url):
-            flash("<strong>Error:</strong>"
-                  "The anti-spam algorithm determined that the submitted url is not "
-                  "an open directory or the server is not responding. If you think "
-                  "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
-
-            return redirect("/submit")
-
-        web_id = db.insert_website(Website(url, str(request.remote_addr), str(request.user_agent)))
-        db.enqueue(web_id)
-        flash("The website has been added to the queue", "success")
+        message, msg_type = try_enqueue(url)
+        flash(message, msg_type)

        return redirect("/submit")
    else:
--- a/queue_reddit_links.py
+++ b/queue_reddit_links.py
@@ -57,22 +57,20 @@ for comment in subreddit.comments(limit=50):

                website = db.get_website_by_url(url)

-                if website and not scanned:
-                    # in progress
-                    print(url)
-                    print("In progress")
-                    continue
-
-                if website and db.website_has_been_scanned(url):
+                if website:
+                    if not scanned:
+                        # in progress
+                        print(url)
+                        print("In progress")
+                        continue
                    handle_exact_repost(website.id, comment)
                    continue

                website_id = db.website_exists(url)
-                if website_id and not scanned:
-                    # IN progress
-                    print("Parent in progress")
-                    continue
-                if website_id and db.website_has_been_scanned(url):
+                if website_id:
+                    if not scanned:
+                        print("Parent in progress")
+                        continue
                    handle_subdir_repost(website_id, comment)
                    continue

@@ -116,16 +114,25 @@ for s in submissions:
        if not bot.has_crawled(s.id):

            url = os.path.join(s.url, "")  # add trailing slash
+            scanned = db.website_has_been_scanned(url)

            website = db.get_website_by_url(url)

            if website:
+                if not scanned:
+                    print(url)
+                    print("In progress")
+                    continue
                handle_exact_repost(website.id, s)
+                continue

            website_id = db.website_exists(url)
            if website_id:
-                bot.log_crawl(s.id)
+                if not scanned:
+                    print("Parent in progress")
+                    continue
                handle_subdir_repost(website_id, s)
+                continue

            if not od_util.is_valid_url(url):
                print("Skipping reddit post: Invalid url")