From 270ab1335a1ab36e7a13f7eaf8937d6c95488ca4 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 2 Jun 2018 17:26:15 -0400 Subject: [PATCH] Added reply to comments option, fixed some bugs --- app.py | 4 +- database.py | 60 +++++++++++++++---- init_script.sql | 1 + queue_reddit_links.py | 106 +++++++++++++++++++++++++++++---- reddit_bot.py | 62 +++++++++++-------- scrapy_od_database/settings.py | 6 +- task.py | 22 +++++-- templates/websites.html | 2 +- 8 files changed, 200 insertions(+), 63 deletions(-) diff --git a/app.py b/app.py index 8b5e9f3..072361b 100644 --- a/app.py +++ b/app.py @@ -1,9 +1,9 @@ from flask import Flask, render_template, redirect, request, flash, abort, Response +import os import json from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util -from urllib.parse import urljoin import sqlite3 from flask_caching import Cache from task import TaskManager @@ -111,7 +111,7 @@ def submit(): def enqueue(): if not recaptcha.verify(): - url = urljoin(request.form.get("url"), "") + url = os.path.join(request.form.get("url"), "") website = db.get_website_by_url(url) diff --git a/database.py b/database.py index 79b42ec..1e0ede8 100644 --- a/database.py +++ b/database.py @@ -139,13 +139,16 @@ class Database: else: return None - def enqueue(self, website_id, reddit_post_id=None, priority=1): + def enqueue(self, website_id, reddit_post_id=None, reddit_comment_id=None, priority=1): with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - - cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)", - (website_id, reddit_post_id, priority)) + if reddit_post_id: + cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)", + (website_id, reddit_post_id, priority)) + else: + cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_comment_id, priority) VALUES (?,?,?)", + (website_id, reddit_comment_id, priority)) conn.commit() def dequeue(self): @@ -153,12 +156,13 @@ class Database: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - cursor.execute("SELECT website_id, reddit_post_id FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1") - website_id = cursor.fetchone() + cursor.execute("SELECT website_id, reddit_post_id, reddit_comment_id" + " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1") + website = cursor.fetchone() - if website_id: - cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id[0],)) - return website_id[0], website_id[1] + if website: + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website[0],)) + return website[0], website[1], website[2] else: return None @@ -216,7 +220,8 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File " - "WHERE File.path_id IN (SELECT id FROM WebsitePath WHERE website_id = ?)", (website_id, )) + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ?", (website_id, )) file_sum, file_count = cursor.fetchone() cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File " @@ -230,8 +235,36 @@ class Database: website_url, website_date = cursor.fetchone() return { - "total_size": file_sum, - "total_count": file_count, + "total_size": file_sum if file_sum else 0, + "total_count": file_count if file_count else 0, + "base_url": website_url, + "report_time": website_date, + "mime_stats": db_mime_stats + } + + def get_subdir_stats(self, website_id: 
int, path: str): + """Get stats of a sub directory. path must not start with / and must end with /""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + + cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File " + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ? AND Path.path LIKE ?", (website_id, path + "%")) + file_sum, file_count = cursor.fetchone() + + cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File " + "INNER JOIN FileType ON FileType.id = File.mime_id " + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ? AND Path.path LIKE ? " + "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, path + "%")) + db_mime_stats = cursor.fetchall() + + cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, )) + website_url, website_date = cursor.fetchone() + + return { + "total_size": file_sum if file_sum else 0, + "total_count": file_count if file_count else 0, "base_url": website_url, "report_time": website_date, "mime_stats": db_mime_stats @@ -266,7 +299,8 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)", (url, )) - return True if cursor.fetchone() else False + website_id = cursor.fetchone() + return website_id[0] if website_id else None def clear_website(self, website_id): """Remove all files from a website and update its last_updated date""" diff --git a/init_script.sql b/init_script.sql index 79df53f..9dbca20 100644 --- a/init_script.sql +++ b/init_script.sql @@ -36,6 +36,7 @@ CREATE TABLE Queue ( id INTEGER PRIMARY KEY NOT NULL, website_id INTEGER UNIQUE, reddit_post_id TEXT, + reddit_comment_id TEXT, priority INTEGER ); diff --git a/queue_reddit_links.py b/queue_reddit_links.py index f09d10b..80e8f04 100644 --- a/queue_reddit_links.py +++ b/queue_reddit_links.py @@ -2,45 +2,127 @@ import praw from reddit_bot import RedditBot from database import Database, Website import od_util -from urllib.parse import urljoin +import os +import re +pattern = re.compile("[\[\]\\\()]+") reddit = praw.Reddit('opendirectories-bot', user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') db = Database("db.sqlite3") subreddit = reddit.subreddit("opendirectories") +# subreddit = reddit.subreddit("test") +bot = RedditBot("crawled.txt", reddit) submissions = [] -for submission in subreddit.new(limit=1): + +def handle_exact_repost(website_id, reddit_obj): + stats = db.get_website_stats(website_id) + comment = bot.get_comment({"": stats}, website_id, + f"I already scanned this website on {website.last_modified} UTC") + print(comment) + print("Exact repost!") + bot.reply(reddit_obj, comment) + + +def handle_subdir_repost(website_id, reddit_obj): + + website = db.get_website_by_id(website_id) + + subdir = url[len(website.url):] + + subdir_stats = db.get_subdir_stats(website_id, subdir) + stats = db.get_website_stats(website_id) + comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats}, + website_id, f"I already scanned a parent directory of this website on" + f" {website.last_modified} UTC") + print(comment) + print("Subdir repost!") + bot.reply(reddit_obj, comment) + + +# Check comments +for comment in []: #subreddit.comments(limit=50): + + if not bot.has_crawled(comment): + text = pattern.sub(" ", comment.body).strip() + if text.startswith("u/opendirectories-bot") or 
text.startswith("/u/opendirectories-bot"): + lines = text.split() + if len(lines) > 1: + url = os.path.join(lines[1], "") # Add trailing slash + + website = db.get_website_by_url(url) + + if website: + bot.log_crawl(comment.id) + handle_exact_repost(website.id, comment) + continue + + website_id = db.website_exists(url) + if website_id: + bot.log_crawl(comment.id) + handle_subdir_repost(website_id, comment) + continue + + if not od_util.is_valid_url(url): + print("Skipping reddit comment: Invalid url") + bot.log_crawl(comment.id) + bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" + f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n") + continue + + if not od_util.is_od(url): + print("Skipping reddit comment: Not an OD") + print(url) + bot.log_crawl(comment.id) + bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" + f"{url}` does not point to an open directory. This could also mean that the " + f"website is not responding (in which case, feel free to retry in a few minutes)" + f" If you think that this is an error, please " + f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)") + continue + + bot.log_crawl(comment.id) + web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) + db.enqueue(web_id, reddit_comment_id=comment.id, priority=2) # Medium priority for reddit comments + print("Queued comment post: " + str(web_id)) + + +# Check posts +for submission in subreddit.new(limit=500): submissions.append(submission) -bot = RedditBot("crawled.txt", reddit) for s in submissions: if not s.is_self: if not bot.has_crawled(s.id): - url = urljoin(s.url, "") + url = os.path.join(s.url, "") # add trailing slash website = db.get_website_by_url(url) if website: - continue + bot.log_crawl(s.id) + handle_exact_repost(website.id, s) - website = db.website_exists(url) - if website: - print("Repost!") - continue + website_id = db.website_exists(url) + if website_id: + bot.log_crawl(s.id) + handle_subdir_repost(website_id, s) if not od_util.is_valid_url(url): - print("Parent dir already posted!") + print("Skipping reddit post: Invalid url") + bot.log_crawl(s.id) continue if not od_util.is_od(url): + print("Skipping reddit post: Not an OD") print(url) + bot.log_crawl(s.id) continue + bot.log_crawl(s.id) web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, s.id, priority=2) # Higher priority for reddit posts - print("Queued " + str(web_id)) + db.enqueue(web_id, reddit_post_id=s.id, priority=3) # Higher priority for reddit posts + print("Queued reddit post: " + str(web_id)) diff --git a/reddit_bot.py b/reddit_bot.py index f4d62bc..fe314ee 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -6,6 +6,9 @@ import humanfriendly class RedditBot: + bottom_line = "^(Beep boop. 
I am a bot that calculates the file sizes & count of " \ + "open directories posted in /r/opendirectories/)" + def __init__(self, log_file: str, reddit: praw.Reddit): self.log_file = log_file @@ -34,45 +37,52 @@ class RedditBot: with open(self.log_file, "r") as f: self.crawled = list(filter(None, f.read().split("\n"))) - def reply(self, post_id: str, comment: str): - - submission = self.reddit.submission(id=post_id) + def reply(self, reddit_obj, comment: str): while True: try: - if not self.has_crawled(submission.id): - submission.reply(comment) - self.log_crawl(submission.id) + # Double check has_crawled + if not self.has_crawled(reddit_obj.id): + # reddit_obj.reply(comment) + print("Skipping comment " + comment) + self.log_crawl(reddit_obj.id) break except Exception as e: - print("Waiting 10 minutes: " + str(e)) - time.sleep(600) + print("Waiting 5 minutes: " + str(e)) + time.sleep(300) continue @staticmethod - def get_comment(stats, website_id): + def get_comment(stats: dict, website_id, message: str = ""): + comment = message + " \n" if len(message) > 0 else "" - comment = "File types | Count | Total Size\n" - comment += ":-- | :-- | :-- \n" - print(stats["mime_stats"]) + for stat in stats: + comment += stat + " \n" if len(stat) > 0 else "" + comment += RedditBot.format_stats(stats[stat]) + + comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)" + comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n" + comment += "*** \n" + comment += RedditBot.bottom_line + + return comment + + @staticmethod + def format_stats(stats): + + result = " \n" + result += "File types | Count | Total Size\n" + result += ":-- | :-- | :-- \n" counter = 0 for mime in stats["mime_stats"]: - print(mime) - comment += mime[2] - comment += " | " + str(mime[1]) + " \n" - comment += " | " + str(mime[0]) + " \n" + result += mime[2] + result += " | " + str(mime[1]) + result += " | " + humanfriendly.format_size(mime[0]) + " \n" counter += 1 if counter >= 3: break - comment += "**Total** | **" + str(stats["total_count"]) + "** | **" - comment += humanfriendly.format_size(stats["total_size"]) + "** \n\n" - - comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)" - comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n" - comment += "*** \n^(Beep boop. 
I am a bot that calculates the file sizes & count of" - comment += " open directories posted in /r/opendirectories/)" - - return comment - + result += "**Total** | **" + str(stats["total_count"]) + "** | **" + result += humanfriendly.format_size(stats["total_size"]) + "** \n\n" + return result diff --git a/scrapy_od_database/settings.py b/scrapy_od_database/settings.py index 093fcc9..5241e55 100644 --- a/scrapy_od_database/settings.py +++ b/scrapy_od_database/settings.py @@ -24,15 +24,15 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) CONCURRENT_REQUESTS = 40 -RETRY_TIMES = 5 -DOWNLOAD_TIMEOUT = 50 +RETRY_TIMES = 6 +DOWNLOAD_TIMEOUT = 90 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -CONCURRENT_REQUESTS_PER_DOMAIN = 50 +CONCURRENT_REQUESTS_PER_DOMAIN = 40 # CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) diff --git a/task.py b/task.py index 764a23d..187121c 100644 --- a/task.py +++ b/task.py @@ -30,10 +30,10 @@ class TaskManager: task = self.db.dequeue() if task: - website_id, post_id = task + website_id, post_id, comment_id = task website = self.db.get_website_by_id(website_id) self.current_task = Process(target=self.execute_task, - args=(website, self.busy, post_id)) + args=(website, self.busy, post_id, comment_id)) self.current_website = website self.current_task.start() @@ -42,7 +42,7 @@ class TaskManager: self.current_task = None self.current_website = None - def execute_task(self, website: Website, busy: Value, post_id: str): + def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str): busy.value = 1 if os.path.exists("data.json"): os.remove("data.json") @@ -57,12 +57,22 @@ class TaskManager: print("Imported in SQLite3") if post_id: - # TODO check should_comment() + # Reply to post stats = self.db.get_website_stats(website.id) - comment = self.reddit_bot.get_comment(stats, website.id) + comment = self.reddit_bot.get_comment({"": stats}, website.id) print(comment) - print(self.reddit_bot.reddit.submission(post_id)) + if "total_size" in stats and stats["total_size"] > 10000000: + post = self.reddit_bot.reddit.submission(post_id) + self.reddit_bot.reply(post, comment) + pass + elif comment_id: + # Reply to comment + stats = self.db.get_website_stats(website.id) + comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id) + print(comment) + reddit_comment = self.reddit_bot.reddit.comment(comment_id) + self.reddit_bot.reply(reddit_comment, comment) busy.value = 0 print("Done crawling task") diff --git a/templates/websites.html b/templates/websites.html index 28347e9..adc2357 100644 --- a/templates/websites.html +++ b/templates/websites.html @@ -8,7 +8,7 @@
 Last updated websites
-[one-line change in this hunk; the HTML markup did not survive extraction]
+[one-line change in this hunk; the HTML markup did not survive extraction]
 Url
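
For reference, a minimal usage sketch (not part of the patch) of the reworked RedditBot.get_comment introduced above: it now takes a dict mapping a section label to a stats dict, prepends an optional message, and renders one table per section through format_stats. The class, method, and key names come from the diff; the website id, MIME rows, and sizes below are invented for illustration.

    # Hypothetical call site; all numbers and MIME rows are made up.
    from reddit_bot import RedditBot

    parent_stats = {
        "total_size": 123456789,
        "total_count": 420,
        "mime_stats": [
            (100000000, 300, "video/x-matroska"),
            (20000000, 100, "image/jpeg"),
            (3456789, 20, "text/plain"),
        ],
    }
    subdir_stats = {
        "total_size": 20000000,
        "total_count": 100,
        "mime_stats": [(20000000, 100, "image/jpeg")],
    }

    comment = RedditBot.get_comment(
        {"Parent directory:": parent_stats, "Subdirectory `/movies/`:": subdir_stats},
        website_id=42,
        message="I already scanned a parent directory of this website on 2018-06-01 UTC",
    )
    print(comment)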
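
Similarly, a short sketch of the subdirectory-repost check that queue_reddit_links.py now performs, assuming the Database API from this patch; the database path and the example URL are placeholders:

    import os
    from database import Database

    db = Database("db.sqlite3")
    url = os.path.join("http://example.com/pub/movies/", "")  # add a trailing slash, as the patch does

    # website_exists() now returns the id of an already-indexed website whose
    # URL is a prefix of `url` (or None), instead of a bare True/False.
    website_id = db.website_exists(url)
    if website_id:
        website = db.get_website_by_id(website_id)
        subdir = url[len(website.url):]  # e.g. "pub/movies/", no leading slash, trailing slash kept
        subdir_stats = db.get_subdir_stats(website_id, subdir)  # LIKE 'pub/movies/%' within that website
        parent_stats = db.get_website_stats(website_id)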