Mirror of https://github.com/simon987/od-database.git
Synced 2025-04-16 17:06:46 +00:00
Added reply to comments option, fixed some bugs

parent bb872a9248
commit 270ab1335a
app.py (4 changes)
@@ -1,9 +1,9 @@
 from flask import Flask, render_template, redirect, request, flash, abort, Response
+import os
 import json
 from database import Database, Website, InvalidQueryException
 from flask_recaptcha import ReCaptcha
 import od_util
-from urllib.parse import urljoin
 import sqlite3
 from flask_caching import Cache
 from task import TaskManager

@@ -111,7 +111,7 @@ def submit():
 def enqueue():
     if not recaptcha.verify():

-        url = urljoin(request.form.get("url"), "")
+        url = os.path.join(request.form.get("url"), "")

         website = db.get_website_by_url(url)
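Why the switch away from urljoin matters: urljoin(url, "") simply returns the base URL unchanged, so the old code never actually appended the trailing slash it was after, while os.path.join(url, "") does (on a POSIX host, where the path separator is "/"). A quick illustration with a made-up URL:

    import os
    from urllib.parse import urljoin

    url = "http://example.com/files"
    print(urljoin(url, ""))             # http://example.com/files   (no slash added)
    print(os.path.join(url, ""))        # http://example.com/files/  (slash appended)
    print(os.path.join(url + "/", ""))  # http://example.com/files/  (idempotent)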
database.py (60 changes)
@@ -139,13 +139,16 @@ class Database:
         else:
             return None

-    def enqueue(self, website_id, reddit_post_id=None, priority=1):
+    def enqueue(self, website_id, reddit_post_id=None, reddit_comment_id=None, priority=1):

         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()

-            cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)",
-                           (website_id, reddit_post_id, priority))
+            if reddit_post_id:
+                cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_post_id, priority) VALUES (?,?,?)",
+                               (website_id, reddit_post_id, priority))
+            else:
+                cursor.execute("INSERT OR IGNORE INTO Queue (website_id, reddit_comment_id, priority) VALUES (?,?,?)",
+                               (website_id, reddit_comment_id, priority))
             conn.commit()

     def dequeue(self):

@@ -153,12 +156,13 @@ class Database:
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()

-            cursor.execute("SELECT website_id, reddit_post_id FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
-            website_id = cursor.fetchone()
+            cursor.execute("SELECT website_id, reddit_post_id, reddit_comment_id"
+                           " FROM Queue ORDER BY priority DESC, Queue.id ASC LIMIT 1")
+            website = cursor.fetchone()

-            if website_id:
-                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website_id[0],))
-                return website_id[0], website_id[1]
+            if website:
+                cursor.execute("DELETE FROM Queue WHERE website_id=?", (website[0],))
+                return website[0], website[1], website[2]
             else:
                 return None
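A minimal sketch of the queue round trip after this change (hypothetical IDs; assumes the Queue schema change shown further down, which adds the reddit_comment_id column):

    db = Database("db.sqlite3")

    # Post-sourced and comment-sourced tasks now fill different columns:
    db.enqueue(website_id=1, reddit_post_id="8abc1d", priority=3)
    db.enqueue(website_id=2, reddit_comment_id="e0fgh2j", priority=2)

    # dequeue() now returns a 3-tuple; at most one of the two reddit IDs is set.
    website_id, post_id, comment_id = db.dequeue()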
@@ -216,7 +220,8 @@ class Database:
             cursor = conn.cursor()

             cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
-                           "WHERE File.path_id IN (SELECT id FROM WebsitePath WHERE website_id = ?)", (website_id, ))
+                           "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                           "WHERE Path.website_id = ?", (website_id, ))
             file_sum, file_count = cursor.fetchone()

             cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
@@ -230,8 +235,36 @@ class Database:
             website_url, website_date = cursor.fetchone()

             return {
-                "total_size": file_sum,
-                "total_count": file_count,
+                "total_size": file_sum if file_sum else 0,
+                "total_count": file_count if file_count else 0,
                 "base_url": website_url,
                 "report_time": website_date,
                 "mime_stats": db_mime_stats
             }

+    def get_subdir_stats(self, website_id: int, path: str):
+        """Get stats of a sub directory. path must not start with / and must end with /"""
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+
+            cursor.execute("SELECT SUM(File.size), COUNT(*) FROM File "
+                           "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                           "WHERE Path.website_id = ? AND Path.path LIKE ?", (website_id, path + "%"))
+            file_sum, file_count = cursor.fetchone()
+
+            cursor.execute("SELECT SUM(File.size) as total_size, COUNT(File.id), FileType.mime FROM File "
+                           "INNER JOIN FileType ON FileType.id = File.mime_id "
+                           "INNER JOIN WebsitePath Path on File.path_id = Path.id "
+                           "WHERE Path.website_id = ? AND Path.path LIKE ? "
+                           "GROUP BY FileType.id ORDER BY total_size DESC", (website_id, path + "%"))
+            db_mime_stats = cursor.fetchall()
+
+            cursor.execute("SELECT Website.url, Website.last_modified FROM Website WHERE id = ?", (website_id, ))
+            website_url, website_date = cursor.fetchone()
+
+            return {
+                "total_size": file_sum if file_sum else 0,
+                "total_count": file_count if file_count else 0,
+                "base_url": website_url,
+                "report_time": website_date,
+                "mime_stats": db_mime_stats
+            }
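An illustrative call that respects the docstring's path contract (IDs and path made up):

    # path must not start with "/" and must end with "/";
    # the LIKE pattern path + "%" then matches every file under that subtree.
    stats = db.get_subdir_stats(website_id=42, path="movies/2018/")
    print(stats["total_count"], stats["total_size"], stats["mime_stats"][:3])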
@@ -266,7 +299,8 @@ class Database:
             cursor = conn.cursor()

             cursor.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)", (url, ))
-            return True if cursor.fetchone() else False
+            website_id = cursor.fetchone()
+            return website_id[0] if website_id else None

     def clear_website(self, website_id):
         """Remove all files from a website and update its last_updated date"""
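The substr() comparison above is a prefix match: it finds a stored Website whose url is a prefix of the submitted url, which is what makes the new subdirectory-repost detection possible. A self-contained sketch (in-memory database, made-up rows):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE Website (id INTEGER PRIMARY KEY, url TEXT)")
    conn.execute("INSERT INTO Website (url) VALUES ('http://example.com/files/')")

    # substr(?, 0, length(url) + 1) yields the first length(url) characters of the
    # submitted url (SQLite treats a start index of 0 as one position before the
    # string), so the WHERE clause holds when the stored url is a prefix.
    row = conn.execute("SELECT id FROM Website WHERE url = substr(?, 0, length(url) + 1)",
                       ("http://example.com/files/movies/",)).fetchone()
    print(row)  # (1,) -> the stored site is a parent of the submitted directory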
@@ -36,6 +36,7 @@ CREATE TABLE Queue (
     id INTEGER PRIMARY KEY NOT NULL,
     website_id INTEGER UNIQUE,
     reddit_post_id TEXT,
+    reddit_comment_id TEXT,
     priority INTEGER
 );
@@ -2,45 +2,127 @@ import praw
 from reddit_bot import RedditBot
 from database import Database, Website
 import od_util
-from urllib.parse import urljoin
+import os
+import re

+pattern = re.compile("[\[\]\\\()]+")
 reddit = praw.Reddit('opendirectories-bot',
                      user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
 db = Database("db.sqlite3")
 subreddit = reddit.subreddit("opendirectories")
+# subreddit = reddit.subreddit("test")
+bot = RedditBot("crawled.txt", reddit)

 submissions = []

-for submission in subreddit.new(limit=1):
-    submissions.append(submission)
-
-bot = RedditBot("crawled.txt", reddit)

+def handle_exact_repost(website_id, reddit_obj):
+    website = db.get_website_by_id(website_id)
+    stats = db.get_website_stats(website_id)
+    comment = bot.get_comment({"": stats}, website_id,
+                              f"I already scanned this website on {website.last_modified} UTC")
+    print(comment)
+    print("Exact repost!")
+    bot.reply(reddit_obj, comment)
+
+
+def handle_subdir_repost(website_id, reddit_obj):
+
+    website = db.get_website_by_id(website_id)
+    subdir = url[len(website.url):]
+
+    subdir_stats = db.get_subdir_stats(website_id, subdir)
+    stats = db.get_website_stats(website_id)
+    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
+                              website_id, f"I already scanned a parent directory of this website on"
+                                          f" {website.last_modified} UTC")
+    print(comment)
+    print("Subdir repost!")
+    bot.reply(reddit_obj, comment)
+
+
+# Check comments
+for comment in []:  # subreddit.comments(limit=50):
+
+    if not bot.has_crawled(comment.id):
+        text = pattern.sub(" ", comment.body).strip()
+        if text.startswith("u/opendirectories-bot") or text.startswith("/u/opendirectories-bot"):
+            lines = text.split()
+            if len(lines) > 1:
+                url = os.path.join(lines[1], "")  # Add trailing slash
+
+                website = db.get_website_by_url(url)
+
+                if website:
+                    bot.log_crawl(comment.id)
+                    handle_exact_repost(website.id, comment)
+                    continue
+
+                website_id = db.website_exists(url)
+                if website_id:
+                    bot.log_crawl(comment.id)
+                    handle_subdir_repost(website_id, comment)
+                    continue
+
+                if not od_util.is_valid_url(url):
+                    print("Skipping reddit comment: Invalid url")
+                    bot.log_crawl(comment.id)
+                    bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you "
+                                       f"provided: `{url}` is not valid. Make sure that you include the "
+                                       f"`http(s)://` prefix. \n")
+                    continue
+
+                if not od_util.is_od(url):
+                    print("Skipping reddit comment: Not an OD")
+                    print(url)
+                    bot.log_crawl(comment.id)
+                    bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you "
+                                       f"provided: `{url}` does not point to an open directory. This could also "
+                                       f"mean that the website is not responding (in which case, feel free to "
+                                       f"retry in a few minutes). If you think that this is an error, please "
+                                       f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+                    continue
+
+                bot.log_crawl(comment.id)
+                web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
+                db.enqueue(web_id, reddit_comment_id=comment.id, priority=2)  # Medium priority for reddit comments
+                print("Queued comment post: " + str(web_id))
+
+
+# Check posts
+for submission in subreddit.new(limit=500):
+    submissions.append(submission)

 for s in submissions:

     if not s.is_self:
         if not bot.has_crawled(s.id):

-            url = urljoin(s.url, "")
+            url = os.path.join(s.url, "")  # add trailing slash

             website = db.get_website_by_url(url)

             if website:
-                print("Repost!")
-                continue
+                bot.log_crawl(s.id)
+                handle_exact_repost(website.id, s)
+                continue

-            website = db.website_exists(url)
-            if website:
-                print("Parent dir already posted!")
-                continue
+            website_id = db.website_exists(url)
+            if website_id:
+                bot.log_crawl(s.id)
+                handle_subdir_repost(website_id, s)
+                continue

             if not od_util.is_valid_url(url):
                 print("Skipping reddit post: Invalid url")
                 bot.log_crawl(s.id)
                 continue

             if not od_util.is_od(url):
                 print("Skipping reddit post: Not an OD")
                 print(url)
                 bot.log_crawl(s.id)
                 continue

             bot.log_crawl(s.id)
             web_id = db.insert_website(Website(url, "localhost", "reddit_bot"))
-            db.enqueue(web_id, s.id, priority=2)  # Higher priority for reddit posts
-            print("Queued " + str(web_id))
+            db.enqueue(web_id, reddit_post_id=s.id, priority=3)  # Higher priority for reddit posts
+            print("Queued reddit post: " + str(web_id))
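For reference, a standalone sketch of the comment-trigger parsing added above, with a made-up comment body: the regex flattens reddit markdown link syntax, then the second whitespace-separated token is taken as the URL.

    import os
    import re

    pattern = re.compile("[\[\]\\\()]+")  # strips [ ] \ ( ) from markdown links

    body = "/u/opendirectories-bot [http://example.com/pub](http://example.com/pub)"
    text = pattern.sub(" ", body).strip()
    lines = text.split()
    if text.startswith(("u/opendirectories-bot", "/u/opendirectories-bot")) and len(lines) > 1:
        url = os.path.join(lines[1], "")  # add trailing slash
        print(url)  # http://example.com/pub/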
@@ -6,6 +6,9 @@ import humanfriendly

 class RedditBot:

+    bottom_line = "^(Beep boop. I am a bot that calculates the file sizes & count of " \
+                  "open directories posted in /r/opendirectories/)"
+
     def __init__(self, log_file: str, reddit: praw.Reddit):

         self.log_file = log_file
@@ -34,45 +37,52 @@ class RedditBot:
         with open(self.log_file, "r") as f:
             self.crawled = list(filter(None, f.read().split("\n")))

-    def reply(self, post_id: str, comment: str):
-
-        submission = self.reddit.submission(id=post_id)
+    def reply(self, reddit_obj, comment: str):

         while True:
             try:
-                if not self.has_crawled(submission.id):
-                    submission.reply(comment)
-                    self.log_crawl(submission.id)
+                # Double check has_crawled
+                if not self.has_crawled(reddit_obj.id):
+                    # reddit_obj.reply(comment)
+                    print("Skipping comment " + comment)
+                    self.log_crawl(reddit_obj.id)
                 break
             except Exception as e:
-                print("Waiting 10 minutes: " + str(e))
-                time.sleep(600)
+                print("Waiting 5 minutes: " + str(e))
+                time.sleep(300)
                 continue

     @staticmethod
-    def get_comment(stats, website_id):
-
-        comment = "File types | Count | Total Size\n"
-        comment += ":-- | :-- | :-- \n"
-        print(stats["mime_stats"])
-        for mime in stats["mime_stats"]:
-            print(mime)
-            comment += mime[2]
-            comment += " | " + str(mime[1])
-            comment += " | " + str(mime[0]) + " \n"
-
-        comment += "**Total** | **" + str(stats["total_count"]) + "** | **"
-        comment += humanfriendly.format_size(stats["total_size"]) + "** \n\n"
-
-        comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)"
-        comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n"
-        comment += "*** \n^(Beep boop. I am a bot that calculates the file sizes & count of"
-        comment += " open directories posted in /r/opendirectories/)"
-
-        return comment
+    def get_comment(stats: dict, website_id, message: str = ""):
+        comment = message + " \n" if len(message) > 0 else ""
+
+        for stat in stats:
+            comment += stat + " \n" if len(stat) > 0 else ""
+            comment += RedditBot.format_stats(stats[stat])
+
+        comment += "[Full Report](https://simon987.net/od-database/website/" + str(website_id) + "/)"
+        comment += " | [Link list](https://simon987.net/od-database/website/" + str(website_id) + "/links) \n"
+        comment += "*** \n"
+        comment += RedditBot.bottom_line
+
+        return comment
+
+    @staticmethod
+    def format_stats(stats):
+
+        result = " \n"
+        result += "File types | Count | Total Size\n"
+        result += ":-- | :-- | :-- \n"
+        counter = 0
+        for mime in stats["mime_stats"]:
+            result += mime[2]
+            result += " | " + str(mime[1])
+            result += " | " + humanfriendly.format_size(mime[0]) + " \n"
+
+            counter += 1
+            if counter >= 3:
+                break
+
+        result += "**Total** | **" + str(stats["total_count"]) + "** | **"
+        result += humanfriendly.format_size(stats["total_size"]) + "** \n\n"
+        return result
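A sketch of how the refactored get_comment is meant to be called. Section headings are the dict keys (an empty key renders a heading-less table), and each stats value is shaped like Database.get_website_stats output, i.e. mime_stats rows of (total_size, count, mime). All values below are made up:

    stats = {"total_size": 1024 ** 3, "total_count": 512,
             "mime_stats": [(900 * 1024 ** 2, 300, "video/mp4"),
                            (100 * 1024 ** 2, 200, "image/jpeg")]}

    comment = RedditBot.get_comment({"Parent directory:": stats}, website_id=42,
                                    message="I already scanned a parent directory of this website")
    print(comment)  # markdown table(s) + full-report link + bot signature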
@@ -24,15 +24,15 @@ ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 40
-RETRY_TIMES = 5
-DOWNLOAD_TIMEOUT = 50
+RETRY_TIMES = 6
+DOWNLOAD_TIMEOUT = 90

 # Configure a delay for requests for the same website (default: 0)
 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 #DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-CONCURRENT_REQUESTS_PER_DOMAIN = 50
+CONCURRENT_REQUESTS_PER_DOMAIN = 40
 # CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
task.py (22 changes)
@@ -30,10 +30,10 @@ class TaskManager:
         task = self.db.dequeue()

         if task:
-            website_id, post_id = task
+            website_id, post_id, comment_id = task
             website = self.db.get_website_by_id(website_id)
             self.current_task = Process(target=self.execute_task,
-                                        args=(website, self.busy, post_id))
+                                        args=(website, self.busy, post_id, comment_id))
             self.current_website = website
             self.current_task.start()

@@ -42,7 +42,7 @@ class TaskManager:
         self.current_task = None
         self.current_website = None

-    def execute_task(self, website: Website, busy: Value, post_id: str):
+    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
         busy.value = 1
         if os.path.exists("data.json"):
             os.remove("data.json")

@@ -57,12 +57,22 @@ class TaskManager:
         print("Imported in SQLite3")

         if post_id:
+            # TODO check should_comment()
+            # Reply to post
             stats = self.db.get_website_stats(website.id)
-            comment = self.reddit_bot.get_comment(stats, website.id)
+            comment = self.reddit_bot.get_comment({"": stats}, website.id)
             print(comment)
-            print(self.reddit_bot.reddit.submission(post_id))
-            pass
+            if "total_size" in stats and stats["total_size"] > 10000000:
+                post = self.reddit_bot.reddit.submission(post_id)
+                self.reddit_bot.reply(post, comment)
+
+        elif comment_id:
+            # Reply to comment
+            stats = self.db.get_website_stats(website.id)
+            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
+            print(comment)
+            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
+            self.reddit_bot.reply(reddit_comment, comment)
         busy.value = 0
         print("Done crawling task")
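The reply() signature change in reddit_bot.py (above) is what lets execute_task pass either a praw Submission or a praw Comment: the bot only touches .id and .reply(). A stand-in object for dry runs needs just that surface (hypothetical test helper, not part of the commit):

    class FakeRedditObj:
        # Mimics the two members RedditBot.reply() relies on.
        def __init__(self, obj_id):
            self.id = obj_id

        def reply(self, text):
            print("would reply to " + self.id + " with " + str(len(text)) + " chars")

    bot.reply(FakeRedditObj("t3_8abc1d"), "test comment")  # same path for posts and comments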
@@ -8,7 +8,7 @@
 <div class="card">
     <div class="card-header">Last updated websites</div>
     <div class="card-body">
-        <table class="table">
+        <table class="table table-striped">
             <thead>
             <tr>
                 <th>Url</th>