From 270ab1335a1ab36e7a13f7eaf8937d6c95488ca4 Mon Sep 17 00:00:00 2001 From: Simon Date: Sat, 2 Jun 2018 17:26:15 -0400 Subject: [PATCH] Added reply to comments option, fixed some bugs --- app.py | 4 +- database.py | 60 +++++++++++++++---- init_script.sql | 1 + queue_reddit_links.py | 106 +++++++++++++++++++++++++++++---- reddit_bot.py | 62 +++++++++++-------- scrapy_od_database/settings.py | 6 +- task.py | 22 +++++-- templates/websites.html | 2 +- 8 files changed, 200 insertions(+), 63 deletions(-) diff --git a/app.py b/app.py index 8b5e9f3..072361b 100644 --- a/app.py +++ b/app.py @@ -1,9 +1,9 @@ from flask import Flask, render_template, redirect, request, flash, abort, Response +import os import json from database import Database, Website, InvalidQueryException from flask_recaptcha import ReCaptcha import od_util -from urllib.parse import urljoin import sqlite3 from flask_caching import Cache from task import TaskManager @@ -111,7 +111,7 @@ def submit(): def enqueue(): if not recaptcha.verify(): - url = urljoin(request.form.get("url"), "") + url = os.path.join(request.form.get("url"), "") website = db.get_website_by_url(url) diff --git a/database.py b/database.py index 79b42ec..1e0ede8 100644 --- a/database.py +++ b/database.py @@ -139,13 +139,16 @@ class Database: else: return None - def enqueue(self, website_id, reddit_post_id=None, priority=1): + def enqueue(self, website_id, reddit_post_id=None, reddit_comment_id=None, priority=1): with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - - cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)", - (website_id, reddit_post_id, priority)) + if reddit_post_id: + cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)", + (website_id, reddit_post_id, priority)) + else: + cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_comment_id, priority) VALUES (?,?,?)", + (website_id, reddit_comment_id, priority)) conn.commit() def dequeue(self): @@ -153,12 +156,13 @@ class Database: with sqlite3.connect(self.db_path) as conn: cursor = conn.cursor() - cursor.execute("SELECT website_id, reddit_post_id FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1") - website_id = cursor.fetchone() + cursor.execute("SELECT website_id, reddit_post_id, reddit_comment_id" + " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1") + website = cursor.fetchone() - if website_id: - cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id[0],)) - return website_id[0], website_id[1] + if website: + cursor.execute("DELETE FROM Queue WHERE website_id=?", (website[0],)) + return website[0], website[1], website[2] else: return None @@ -216,7 +220,8 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File " - "WHERE File.path_id IN (SELECT id FROM WebsitePath WHERE website_id = ?)", (website_id, )) + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ?", (website_id, )) file_sum, file_count = cursor.fetchone() cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File " @@ -230,8 +235,36 @@ class Database: website_url, website_date = cursor.fetchone() return { - "total_size": file_sum, - "total_count": file_count, + "total_size": file_sum if file_sum else 0, + "total_count": file_count if file_count else 0, + "base_url": website_url, + "report_time": website_date, + "mime_stats": db_mime_stats + } + + def get_subdir_stats(self, website_id: 
int, path: str): + """Get stats of a sub directory. path must not start with / and must end with /""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + + cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File " + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ? AND Path.path LIKE ?", (website_id, path + "%")) + file_sum, file_count = cursor.fetchone() + + cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File " + "INNER JOIN FileType ON FileType.id = File.mime_id " + "INNER JOIN WebsitePath Path on File.path_id = Path.id " + "WHERE Path.website_id = ? AND Path.path LIKE ? " + "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, path + "%")) + db_mime_stats = cursor.fetchall() + + cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, )) + website_url, website_date = cursor.fetchone() + + return { + "total_size": file_sum if file_sum else 0, + "total_count": file_count if file_count else 0, "base_url": website_url, "report_time": website_date, "mime_stats": db_mime_stats @@ -266,7 +299,8 @@ class Database: cursor = conn.cursor() cursor.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)", (url, )) - return True if cursor.fetchone() else False + website_id = cursor.fetchone() + return website_id[0] if website_id else None def clear_website(self, website_id): """Remove all files from a website and update its last_updated date""" diff --git a/init_script.sql b/init_script.sql index 79df53f..9dbca20 100644 --- a/init_script.sql +++ b/init_script.sql @@ -36,6 +36,7 @@ CREATE TABLE Queue ( id INTEGER PRIMARY KEY NOT NULL, website_id INTEGER UNIQUE, reddit_post_id TEXT, + reddit_comment_id TEXT, priority INTEGER ); diff --git a/queue_reddit_links.py b/queue_reddit_links.py index f09d10b..80e8f04 100644 --- a/queue_reddit_links.py +++ b/queue_reddit_links.py @@ -2,45 +2,127 @@ import praw from reddit_bot import RedditBot from database import Database, Website import od_util -from urllib.parse import urljoin +import os +import re +pattern = re.compile("[\[\]\\\()]+") reddit = praw.Reddit('opendirectories-bot', user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)') db = Database("db.sqlite3") subreddit = reddit.subreddit("opendirectories") +# subreddit = reddit.subreddit("test") +bot = RedditBot("crawled.txt", reddit) submissions = [] -for submission in subreddit.new(limit=1): + +def handle_exact_repost(website_id, reddit_obj): + stats = db.get_website_stats(website_id) + comment = bot.get_comment({"": stats}, website_id, + f"I already scanned this website on {website.last_modified} UTC") + print(comment) + print("Exact repost!") + bot.reply(reddit_obj, comment) + + +def handle_subdir_repost(website_id, reddit_obj): + + website = db.get_website_by_id(website_id) + + subdir = url[len(website.url):] + + subdir_stats = db.get_subdir_stats(website_id, subdir) + stats = db.get_website_stats(website_id) + comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats}, + website_id, f"I already scanned a parent directory of this website on" + f" {website.last_modified} UTC") + print(comment) + print("Subdir repost!") + bot.reply(reddit_obj, comment) + + +# Check comments +for comment in []: #subreddit.comments(limit=50): + + if not bot.has_crawled(comment): + text = pattern.sub(" ", comment.body).strip() + if text.startswith("u/opendirectories-bot") or 
text.startswith("/u/opendirectories-bot"): + lines = text.split() + if len(lines) > 1: + url = os.path.join(lines[1], "") # Add trailing slash + + website = db.get_website_by_url(url) + + if website: + bot.log_crawl(comment.id) + handle_exact_repost(website.id, comment) + continue + + website_id = db.website_exists(url) + if website_id: + bot.log_crawl(comment.id) + handle_subdir_repost(website_id, comment) + continue + + if not od_util.is_valid_url(url): + print("Skipping reddit comment: Invalid url") + bot.log_crawl(comment.id) + bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" + f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n") + continue + + if not od_util.is_od(url): + print("Skipping reddit comment: Not an OD") + print(url) + bot.log_crawl(comment.id) + bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" + f"{url}` does not point to an open directory. This could also mean that the " + f"website is not responding (in which case, feel free to retry in a few minutes)" + f" If you think that this is an error, please " + f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)") + continue + + bot.log_crawl(comment.id) + web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) + db.enqueue(web_id, reddit_comment_id=comment.id, priority=2) # Medium priority for reddit comments + print("Queued comment post: " + str(web_id)) + + +# Check posts +for submission in subreddit.new(limit=500): submissions.append(submission) -bot = RedditBot("crawled.txt", reddit) for s in submissions: if not s.is_self: if not bot.has_crawled(s.id): - url = urljoin(s.url, "") + url = os.path.join(s.url, "") # add trailing slash website = db.get_website_by_url(url) if website: - continue + bot.log_crawl(s.id) + handle_exact_repost(website.id, s) - website = db.website_exists(url) - if website: - print("Repost!") - continue + website_id = db.website_exists(url) + if website_id: + bot.log_crawl(s.id) + handle_subdir_repost(website_id, s) if not od_util.is_valid_url(url): - print("Parent dir already posted!") + print("Skipping reddit post: Invalid url") + bot.log_crawl(s.id) continue if not od_util.is_od(url): + print("Skipping reddit post: Not an OD") print(url) + bot.log_crawl(s.id) continue + bot.log_crawl(s.id) web_id = db.insert_website(Website(url, "localhost", "reddit_bot")) - db.enqueue(web_id, s.id, priority=2) # Higher priority for reddit posts - print("Queued " + str(web_id)) + db.enqueue(web_id, reddit_post_id=s.id, priority=3) # Higher priority for reddit posts + print("Queued reddit post: " + str(web_id)) diff --git a/reddit_bot.py b/reddit_bot.py index f4d62bc..fe314ee 100644 --- a/reddit_bot.py +++ b/reddit_bot.py @@ -6,6 +6,9 @@ import humanfriendly class RedditBot: + bottom_line = "^(Beep boop. 
I am a bot that calculates the file sizes & count of " \ + "open directories posted in /r/opendirectories/)" + def __init__(self, log_file: str, reddit: praw.Reddit): self.log_file = log_file @@ -34,45 +37,52 @@ class RedditBot: with open(self.log_file, "r") as f: self.crawled = list(filter(None, f.read().split("\n"))) - def reply(self, post_id: str, comment: str): - - submission = self.reddit.submission(id=post_id) + def reply(self, reddit_obj, comment: str): while True: try: - if not self.has_crawled(submission.id): - submission.reply(comment) - self.log_crawl(submission.id) + # Double check has_crawled + if not self.has_crawled(reddit_obj.id): + # reddit_obj.reply(comment) + print("Skipping comment " + comment) + self.log_crawl(reddit_obj.id) break except Exception as e: - print("Waiting 10 minutes: " + str(e)) - time.sleep(600) + print("Waiting 5 minutes: " + str(e)) + time.sleep(300) continue @staticmethod - def get_comment(stats, website_id): + def get_comment(stats: dict, website_id, message: str = ""): + comment = message + " \n" if len(message) > 0 else "" - comment = "File types | Count | Total Size\n" - comment += ":-- | :-- | :-- \n" - print(stats["mime_stats"]) + for stat in stats: + comment += stat + " \n" if len(stat) > 0 else "" + comment += RedditBot.format_stats(stats[stat]) + + comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)" + comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n" + comment += "*** \n" + comment += RedditBot.bottom_line + + return comment + + @staticmethod + def format_stats(stats): + + result = " \n" + result += "File types | Count | Total Size\n" + result += ":-- | :-- | :-- \n" counter = 0 for mime in stats["mime_stats"]: - print(mime) - comment += mime[2] - comment += " | " + str(mime[1]) + " \n" - comment += " | " + str(mime[0]) + " \n" + result += mime[2] + result += " | " + str(mime[1]) + result += " | " + humanfriendly.format_size(mime[0]) + " \n" counter += 1 if counter >= 3: break - comment += "**Total** | **" + str(stats["total_count"]) + "** | **" - comment += humanfriendly.format_size(stats["total_size"]) + "** \n\n" - - comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)" - comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n" - comment += "*** \n^(Beep boop. 
I am a bot that calculates the file sizes & count of" - comment += " open directories posted in /r/opendirectories/)" - - return comment - + result += "**Total** | **" + str(stats["total_count"]) + "** | **" + result += humanfriendly.format_size(stats["total_size"]) + "** \n\n" + return result diff --git a/scrapy_od_database/settings.py b/scrapy_od_database/settings.py index 093fcc9..5241e55 100644 --- a/scrapy_od_database/settings.py +++ b/scrapy_od_database/settings.py @@ -24,15 +24,15 @@ ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) CONCURRENT_REQUESTS = 40 -RETRY_TIMES = 5 -DOWNLOAD_TIMEOUT = 50 +RETRY_TIMES = 6 +DOWNLOAD_TIMEOUT = 90 # Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: -CONCURRENT_REQUESTS_PER_DOMAIN = 50 +CONCURRENT_REQUESTS_PER_DOMAIN = 40 # CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) diff --git a/task.py b/task.py index 764a23d..187121c 100644 --- a/task.py +++ b/task.py @@ -30,10 +30,10 @@ class TaskManager: task = self.db.dequeue() if task: - website_id, post_id = task + website_id, post_id, comment_id = task website = self.db.get_website_by_id(website_id) self.current_task = Process(target=self.execute_task, - args=(website, self.busy, post_id)) + args=(website, self.busy, post_id, comment_id)) self.current_website = website self.current_task.start() @@ -42,7 +42,7 @@ class TaskManager: self.current_task = None self.current_website = None - def execute_task(self, website: Website, busy: Value, post_id: str): + def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str): busy.value = 1 if os.path.exists("data.json"): os.remove("data.json") @@ -57,12 +57,22 @@ class TaskManager: print("Imported in SQLite3") if post_id: - # TODO check should_comment() + # Reply to post stats = self.db.get_website_stats(website.id) - comment = self.reddit_bot.get_comment(stats, website.id) + comment = self.reddit_bot.get_comment({"": stats}, website.id) print(comment) - print(self.reddit_bot.reddit.submission(post_id)) + if "total_size" in stats and stats["total_size"] > 10000000: + post = self.reddit_bot.reddit.submission(post_id) + self.reddit_bot.reply(post, comment) + pass + elif comment_id: + # Reply to comment + stats = self.db.get_website_stats(website.id) + comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id) + print(comment) + reddit_comment = self.reddit_bot.reddit.comment(comment_id) + self.reddit_bot.reply(reddit_comment, comment) busy.value = 0 print("Done crawling task") diff --git a/templates/websites.html b/templates/websites.html index 28347e9..adc2357 100644 --- a/templates/websites.html +++ b/templates/websites.html @@ -8,7 +8,7 @@
 Last updated websites
-[one-line change in this hunk; the HTML markup did not survive extraction]
+[one-line change in this hunk; the HTML markup did not survive extraction]
 Url
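
For reference, a minimal usage sketch (not part of the patch) of the reworked RedditBot.get_comment introduced above: it now takes a dict mapping a section label to a stats dict, prepends an optional message, and renders one table per section through format_stats. The class, method, and key names come from the diff; the website id, MIME rows, and sizes below are invented for illustration.

    # Hypothetical call site; all numbers and MIME rows are made up.
    from reddit_bot import RedditBot

    parent_stats = {
        "total_size": 123456789,
        "total_count": 420,
        "mime_stats": [
            (100000000, 300, "video/x-matroska"),
            (20000000, 100, "image/jpeg"),
            (3456789, 20, "text/plain"),
        ],
    }
    subdir_stats = {
        "total_size": 20000000,
        "total_count": 100,
        "mime_stats": [(20000000, 100, "image/jpeg")],
    }

    comment = RedditBot.get_comment(
        {"Parent directory:": parent_stats, "Subdirectory `/movies/`:": subdir_stats},
        website_id=42,
        message="I already scanned a parent directory of this website on 2018-06-01 UTC",
    )
    print(comment)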
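
Similarly, a short sketch of the subdirectory-repost check that queue_reddit_links.py now performs, assuming the Database API from this patch; the database path and the example URL are placeholders:

    import os
    from database import Database

    db = Database("db.sqlite3")
    url = os.path.join("http://example.com/pub/movies/", "")  # add a trailing slash, as the patch does

    # website_exists() now returns the id of an already-indexed website whose
    # URL is a prefix of `url` (or None), instead of a bare True/False.
    website_id = db.website_exists(url)
    if website_id:
        website = db.get_website_by_id(website_id)
        subdir = url[len(website.url):]  # e.g. "pub/movies/", no leading slash, trailing slash kept
        subdir_stats = db.get_subdir_stats(website_id, subdir)  # LIKE 'pub/movies/%' within that website
        parent_stats = db.get_website_stats(website_id)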