Added a reply-to-comments option, fixed some bugs

Simon 2018-06-02 17:26:15 -04:00
parent bb872a9248
commit 270ab1335a
8 changed files with 200 additions and 63 deletions

app.py

@@ -1,9 +1,9 @@
from flask import Flask, render_template, redirect, request, flash, abort, Response
+import os
import json
from database import Database, Website, InvalidQueryException
from flask_recaptcha import ReCaptcha
import od_util
-from urllib.parse import urljoin
import sqlite3
from flask_caching import Cache
from task import TaskManager
@@ -111,7 +111,7 @@ def submit():
def enqueue():
    if not recaptcha.verify():
-        url = urljoin(request.form.get("url"), "")
+        url = os.path.join(request.form.get("url"), "")
        website = db.get_website_by_url(url)


@@ -139,13 +139,16 @@ class Database:
        else:
            return None

-    def enqueue(self, website_id, reddit_post_id=None, priority=1):
+    def enqueue(self, website_id, reddit_post_id=None, reddit_comment_id=None, priority=1):

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
+            if reddit_post_id:
+                cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)",
+                               (website_id, reddit_post_id, priority))
+            else:
+                cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_comment_id, priority) VALUES (?,?,?)",
+                               (website_id, reddit_comment_id, priority))
            conn.commit()

    def dequeue(self):
@@ -153,12 +156,13 @@ class Database:
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
-            cursor.execute("SELECT website_id, reddit_post_id FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
-            website_id = cursor.fetchone()
+            cursor.execute("SELECT website_id, reddit_post_id, reddit_comment_id"
+                           " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
+            website = cursor.fetchone()

-            if website_id:
-                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id[0],))
-                return website_id[0], website_id[1]
+            if website:
+                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website[0],))
+                return website[0], website[1], website[2]
            else:
                return None
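
With this change a queue row carries either a reddit post id or a reddit comment id, and dequeue() hands back a three-tuple instead of a pair. A rough sketch of the intended contract (the ids and website ids below are made up for illustration; db.sqlite3 is the path used elsewhere in this commit):

from database import Database

db = Database("db.sqlite3")

# One row per website: either a post id or a comment id identifies the reply target.
db.enqueue(website_id=1, reddit_post_id="8o9u1f", priority=3)       # hypothetical post id
db.enqueue(website_id=2, reddit_comment_id="e0abcde", priority=2)   # hypothetical comment id

task = db.dequeue()
if task:
    website_id, post_id, comment_id = task   # matches the unpacking in task.py
    print(website_id, post_id, comment_id)
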
@@ -216,7 +220,8 @@ class Database:
            cursor = conn.cursor()
            cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
-                          "WHERE File.path_id IN (SELECT id FROM WebsitePath WHERE website_id = ?)", (website_id, ))
+                          "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                          "WHERE Path.website_id = ?", (website_id, ))
            file_sum, file_count = cursor.fetchone()

            cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
@@ -230,8 +235,36 @@ class Database:
            website_url, website_date = cursor.fetchone()

            return {
-                "total_size": file_sum,
-                "total_count": file_count,
+                "total_size": file_sum if file_sum else 0,
+                "total_count": file_count if file_count else 0,
                "base_url": website_url,
                "report_time": website_date,
                "mime_stats": db_mime_stats
            }

+    def get_subdir_stats(self, website_id: int, path: str):
+        """Get stats of a subdirectory. path must not start with / and must end with /"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
+                           "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                           "WHERE Path.website_id = ? AND Path.path LIKE ?", (website_id, path + "%"))
+            file_sum, file_count = cursor.fetchone()
+
+            cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
+                           "INNER JOIN FileType ON FileType.id = File.mime_id "
+                           "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                           "WHERE Path.website_id = ? AND Path.path LIKE ? "
+                           "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, path + "%"))
+            db_mime_stats = cursor.fetchall()
+
+            cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, ))
+            website_url, website_date = cursor.fetchone()
+
+            return {
+                "total_size": file_sum if file_sum else 0,
+                "total_count": file_count if file_count else 0,
+                "base_url": website_url,
+                "report_time": website_date,
+                "mime_stats": db_mime_stats
+            }
@@ -266,7 +299,8 @@ class Database:
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)", (url, ))
-            return True if cursor.fetchone() else False
+            website_id = cursor.fetchone()
+            return website_id[0] if website_id else None

    def clear_website(self, website_id):
        """Remove all files from a website and update its last_updated date"""


@@ -36,6 +36,7 @@ CREATE TABLE Queue (
    id INTEGER PRIMARY KEY NOT NULL,
    website_id INTEGER UNIQUE,
    reddit_post_id TEXT,
+    reddit_comment_id TEXT,
    priority INTEGER
);
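
The CREATE TABLE above only applies to fresh databases; an existing db.sqlite3 needs the new column added by hand. A possible one-off migration, not part of this commit:

import sqlite3

with sqlite3.connect("db.sqlite3") as conn:
    try:
        conn.execute("ALTER TABLE Queue ADD COLUMN reddit_comment_id TEXT")
    except sqlite3.OperationalError:
        pass  # column already present
    conn.commit()
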


@@ -2,45 +2,127 @@ import praw
from reddit_bot import RedditBot
from database import Database, Website
import od_util
-from urllib.parse import urljoin
+import os
+import re
+
+pattern = re.compile(r"[\[\]\\()]+")

reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')

db = Database("db.sqlite3")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
+bot = RedditBot("crawled.txt", reddit)

submissions = []
-for submission in subreddit.new(limit=1):


+def handle_exact_repost(website_id, reddit_obj):
+    website = db.get_website_by_id(website_id)
+    stats = db.get_website_stats(website_id)
+    comment = bot.get_comment({"": stats}, website_id,
+                              f"I already scanned this website on {website.last_modified} UTC")
+    print(comment)
+    print("Exact repost!")
+    bot.reply(reddit_obj, comment)
+
+
+def handle_subdir_repost(website_id, reddit_obj):
+    website = db.get_website_by_id(website_id)
+    subdir = url[len(website.url):]  # url is set by the scan loops below
+    subdir_stats = db.get_subdir_stats(website_id, subdir)
+    stats = db.get_website_stats(website_id)
+    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
+                              website_id, f"I already scanned a parent directory of this website on"
+                                          f" {website.last_modified} UTC")
+    print(comment)
+    print("Subdir repost!")
+    bot.reply(reddit_obj, comment)
+# Check comments
+for comment in []:  # subreddit.comments(limit=50):
+    if not bot.has_crawled(comment.id):
+        text = pattern.sub(" ", comment.body).strip()
+        if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
+            lines = text.split()
+            if len(lines) > 1:
+                url = os.path.join(lines[1], "")  # Add trailing slash
+                website = db.get_website_by_url(url)
+
+                if website:
+                    bot.log_crawl(comment.id)
+                    handle_exact_repost(website.id, comment)
+                    continue
+
+                website_id = db.website_exists(url)
+                if website_id:
+                    bot.log_crawl(comment.id)
+                    handle_subdir_repost(website_id, comment)
+                    continue
+
+                if not od_util.is_valid_url(url):
+                    print("Skipping reddit comment: Invalid url")
+                    bot.log_crawl(comment.id)
+                    bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
+                                       f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+                    continue
+
+                if not od_util.is_od(url):
+                    print("Skipping reddit comment: Not an OD")
+                    print(url)
+                    bot.log_crawl(comment.id)
+                    bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
+                                       f"{url}` does not point to an open directory. This could also mean that the "
+                                       f"website is not responding (in which case, feel free to retry in a few minutes)."
+                                       f" If you think that this is an error, please "
+                                       f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+                    continue
+
+                bot.log_crawl(comment.id)
+                web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
+                db.enqueue(web_id, reddit_comment_id=comment.id, priority=2)  # Medium priority for reddit comments
+                print("Queued comment post: " + str(web_id))
+# Check posts
+for submission in subreddit.new(limit=500):
    submissions.append(submission)

-bot = RedditBot("crawled.txt", reddit)

for s in submissions:
    if not s.is_self:
        if not bot.has_crawled(s.id):
-            url = urljoin(s.url, "")
+            url = os.path.join(s.url, "")  # add trailing slash
            website = db.get_website_by_url(url)

            if website:
-                continue
+                bot.log_crawl(s.id)
+                handle_exact_repost(website.id, s)

-            website = db.website_exists(url)
-            if website:
-                print("Repost!")
-                continue
+            website_id = db.website_exists(url)
+            if website_id:
+                bot.log_crawl(s.id)
+                handle_subdir_repost(website_id, s)

            if not od_util.is_valid_url(url):
-                print("Parent dir already posted!")
                print("Skipping reddit post: Invalid url")
+                bot.log_crawl(s.id)
                continue

            if not od_util.is_od(url):
                print("Skipping reddit post: Not an OD")
                print(url)
+                bot.log_crawl(s.id)
                continue

            bot.log_crawl(s.id)
            web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
-            db.enqueue(web_id, s.id, priority=2)  # Higher priority for reddit posts
-            print("Queued " + str(web_id))
+            db.enqueue(web_id, reddit_post_id=s.id, priority=3)  # Higher priority for reddit posts
+            print("Queued reddit post: " + str(web_id))


@@ -6,6 +6,9 @@ import humanfriendly
class RedditBot:

+    bottom_line = "^(Beep boop. I am a bot that calculates the file sizes & count of " \
+                  "open directories posted in /r/opendirectories/)"

    def __init__(self, log_file: str, reddit: praw.Reddit):
        self.log_file = log_file
@@ -34,45 +37,52 @@ class RedditBot:
        with open(self.log_file, "r") as f:
            self.crawled = list(filter(None, f.read().split("\n")))

-    def reply(self, post_id: str, comment: str):
-        submission = self.reddit.submission(id=post_id)
+    def reply(self, reddit_obj, comment: str):
        while True:
            try:
-                if not self.has_crawled(submission.id):
-                    submission.reply(comment)
-                    self.log_crawl(submission.id)
+                # Double check has_crawled
+                if not self.has_crawled(reddit_obj.id):
+                    # reddit_obj.reply(comment)
+                    print("Skipping comment " + comment)
+                    self.log_crawl(reddit_obj.id)
                break
            except Exception as e:
-                print("Waiting 10 minutes: " + str(e))
-                time.sleep(600)
+                print("Waiting 5 minutes: " + str(e))
+                time.sleep(300)
                continue
    @staticmethod
-    def get_comment(stats, website_id):
+    def get_comment(stats: dict, website_id, message: str = ""):
+        comment = message + " \n" if len(message) > 0 else ""
-        comment = "File types | Count | Total Size\n"
-        comment += ":-- | :-- | :-- \n"
-        print(stats["mime_stats"])
+        for stat in stats:
+            comment += stat + " \n" if len(stat) > 0 else ""
+            comment += RedditBot.format_stats(stats[stat])
+
+        comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)"
+        comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n"
+        comment += "*** \n"
+        comment += RedditBot.bottom_line
+        return comment
+
+    @staticmethod
+    def format_stats(stats):
+
+        result = " \n"
+        result += "File types | Count | Total Size\n"
+        result += ":-- | :-- | :-- \n"
        counter = 0
        for mime in stats["mime_stats"]:
-            print(mime)
-            comment += mime[2]
-            comment += " | " + str(mime[1]) + " \n"
-            comment += " | " + str(mime[0]) + " \n"
+            result += mime[2]
+            result += " | " + str(mime[1])
+            result += " | " + humanfriendly.format_size(mime[0]) + " \n"
            counter += 1
            if counter >= 3:
                break
-        comment += "**Total** | **" + str(stats["total_count"]) + "** | **"
-        comment += humanfriendly.format_size(stats["total_size"]) + "** \n\n"
-        comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)"
-        comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n"
-        comment += "*** \n^(Beep boop. I am a bot that calculates the file sizes & count of"
-        comment += " open directories posted in /r/opendirectories/)"
-        return comment
+        result += "**Total** | **" + str(stats["total_count"]) + "** | **"
+        result += humanfriendly.format_size(stats["total_size"]) + "** \n\n"
+        return result
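
get_comment() now takes a mapping of section titles to stats dicts and renders one table per section, which is how the handlers and task.py call it. A rough usage sketch with made-up numbers (mime_stats rows are (size, count, mime) tuples, matching the SQL in the Database class; praw and humanfriendly must be installed):

from reddit_bot import RedditBot

stats = {
    "total_size": 13371337000,
    "total_count": 4242,
    "mime_stats": [
        (9000000000, 120, "video/x-matroska"),
        (3000000000, 800, "audio/mpeg"),
    ],
}

# Empty section titles are skipped, so {"": stats} gives a single untitled table.
print(RedditBot.get_comment({"There you go!": stats}, 123))  # 123 is a made-up website id
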


@@ -24,15 +24,15 @@ ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 40
-RETRY_TIMES = 5
-DOWNLOAD_TIMEOUT = 50
+RETRY_TIMES = 6
+DOWNLOAD_TIMEOUT = 90

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
-CONCURRENT_REQUESTS_PER_DOMAIN = 50
+CONCURRENT_REQUESTS_PER_DOMAIN = 40
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)

task.py

@@ -30,10 +30,10 @@ class TaskManager:
            task = self.db.dequeue()

            if task:
-                website_id, post_id = task
+                website_id, post_id, comment_id = task
                website = self.db.get_website_by_id(website_id)

                self.current_task = Process(target=self.execute_task,
-                                            args=(website, self.busy, post_id))
+                                            args=(website, self.busy, post_id, comment_id))
                self.current_website = website
                self.current_task.start()
@@ -42,7 +42,7 @@ class TaskManager:
            self.current_task = None
            self.current_website = None

-    def execute_task(self, website: Website, busy: Value, post_id: str):
+    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
@@ -57,12 +57,22 @@
        print("Imported in SQLite3")

        if post_id:
-            # TODO check should_comment()
+            # Reply to post
            stats = self.db.get_website_stats(website.id)
-            comment = self.reddit_bot.get_comment(stats, website.id)
+            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
-            print(self.reddit_bot.reddit.submission(post_id))
+            if "total_size" in stats and stats["total_size"] > 10000000:
+                post = self.reddit_bot.reddit.submission(post_id)
+                self.reddit_bot.reply(post, comment)
            pass
+        elif comment_id:
+            # Reply to comment
+            stats = self.db.get_website_stats(website.id)
+            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
+            print(comment)
+            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
+            self.reddit_bot.reply(reddit_comment, comment)

        busy.value = 0
        print("Done crawling task")


@@ -8,7 +8,7 @@
    <div class="card">
        <div class="card-header">Last updated websites</div>
        <div class="card-body">
-            <table class="table">
+            <table class="table table-striped">
                <thead>
                    <tr>
                        <th>Url</th>