Mirror of https://github.com/simon987/od-database.git
commit 0b1d76f478 (parent cfa6a9f02f)

Added blacklist feature (untested)
app.py (6 changed lines)
@@ -130,6 +130,12 @@ def enqueue():
               "FTP is not supported", "danger")
         return redirect("/submit")
 
+    if od_util.is_blacklisted(url):
+        flash("<strong>Error:</strong> "
+              "Sorry, this website has been blacklisted. If you think "
+              "this is an error, please <a href='/contribute'>contact me</a>.", "danger")
+        return redirect("/submit")
+
     if not od_util.is_od(url):
         flash("<strong>Error:</strong>"
               "The anti-spam algorithm determined that the submitted url is not "
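The ordering of the gates in enqueue() is worth noting: is_blacklisted() is a local string comparison while is_od() has to probe the remote server, so blacklisted submissions are rejected without any network traffic. A minimal sketch of that gate order (check_submission and the injected predicates are illustrative, not part of the codebase):

# Minimal sketch of the enqueue() gate order; the predicates are injected so
# the control flow can run standalone. Error strings are abbreviated.

def check_submission(url, is_blacklisted, is_od):
    """Return None if the URL is accepted, else the rejection reason."""
    if url.startswith("ftp://"):
        return "FTP is not supported"
    # Cheap local check first: no network request needed.
    if is_blacklisted(url):
        return "Sorry, this website has been blacklisted."
    # Expensive check last: is_od() probes the remote server.
    if not is_od(url):
        return "The anti-spam algorithm determined that the submitted url is not an open directory."
    return None

print(check_submission("https://www.dropbox.com/s/abc",
                       is_blacklisted=lambda u: u.startswith("https://www.dropbox.com"),
                       is_od=lambda u: True))
# -> Sorry, this website has been blacklisted.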
blacklist.txt (new file, 5 changed lines)
@@ -0,0 +1,5 @@
+https://sdo.gsfc.nasa.gov
+https://drive.google
+https://mirror.math.princeton.edu
+http://mirror.math.princeton.edu
+https://www.dropbox.com
database.py (22 changed lines)
@@ -87,11 +87,14 @@ class Database:
 
             conn.commit()
             # Then insert files
-            cursor.execute("PRAGMA foreign_keys = OFF")
-            conn.commit()
             cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
                                [(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])
 
+            # Update date
+            if len(files) > 0:
+                cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
+                               (files[0].website_id, ))
+
             conn.commit()
 
     def import_json(self, json_file, website: Website):
@@ -302,6 +305,21 @@ class Database:
         website_id = cursor.fetchone()
         return website_id[0] if website_id else None
 
+    def website_has_been_scanned(self, url):
+        """Check if a website has at least 1 file"""
+
+        with sqlite3.connect(self.db_path) as conn:
+            cursor = conn.cursor()
+
+            website_id = self.website_exists(url)
+
+            if website_id:
+                cursor.execute("SELECT COUNT(Path.id) FROM Website "
+                               "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
+                               "WHERE Website.id = ?", (website_id, ))
+                return cursor.fetchone()[0] > 0
+            return None
+
     def clear_website(self, website_id):
         """Remove all files from a website and update its last_updated date"""
         with sqlite3.connect(self.db_path) as conn:
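To make the new query concrete, here is a self-contained run of the same JOIN/COUNT shape against an assumed two-table stand-in schema (the real Website and WebsitePath tables carry more columns than shown here):

import sqlite3

# Assumed minimal stand-in for the real schema: only the columns that the
# website_has_been_scanned() query actually touches.
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE Website (id INTEGER PRIMARY KEY, url TEXT)")
cur.execute("CREATE TABLE WebsitePath (id INTEGER PRIMARY KEY, website_id INTEGER, path TEXT)")
cur.execute("INSERT INTO Website (id, url) VALUES (1, 'https://example.com/')")

def has_been_scanned(website_id):
    # Same shape as the new method: count the paths joined to the website row.
    cur.execute("SELECT COUNT(Path.id) FROM Website "
                "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
                "WHERE Website.id = ?", (website_id, ))
    return cur.fetchone()[0] > 0

print(has_been_scanned(1))  # False: the website exists but has no paths yet
cur.execute("INSERT INTO WebsitePath (website_id, path) VALUES (1, 'pub/')")
print(has_been_scanned(1))  # True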
od_util.py (13 changed lines)
@@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):
 
 
 def is_od(url):
-    if "?" in url:
-        print("Url has parameter in url!")
-        return False
 
     if not url.endswith("/"):
         print("Url does not end with trailing /")
@@ -97,3 +94,13 @@ def is_od(url):
     except Exception as e:
         print(e)
         return False
+
+
+def is_blacklisted(url):
+
+    with open("blacklist.txt", "r") as f:
+        for line in f.readlines():
+            if url.startswith(line.strip()):
+                return True
+
+    return False
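is_blacklisted() pairs with blacklist.txt above: each line is treated as a URL prefix via str.startswith(), so an entry blocks the site and everything under it, and matching is scheme-sensitive, which is presumably why mirror.math.princeton.edu is listed under both http:// and https://. A standalone sketch with the list inlined instead of read from disk:

# Standalone sketch of is_blacklisted(): the entries from blacklist.txt are
# inlined here instead of being read from the file.
BLACKLIST = [
    "https://sdo.gsfc.nasa.gov",
    "https://drive.google",
    "https://mirror.math.princeton.edu",
    "http://mirror.math.princeton.edu",
    "https://www.dropbox.com",
]

def is_blacklisted(url, blacklist=BLACKLIST):
    # Plain prefix match, exactly like the loop over f.readlines() above.
    return any(url.startswith(entry) for entry in blacklist)

print(is_blacklisted("https://drive.google.com/drive/folders/abc"))  # True
print(is_blacklisted("http://sdo.gsfc.nasa.gov/"))  # False: http vs https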
[file header lost in capture — reddit bot script]
@@ -19,7 +19,7 @@ submissions = []
 def handle_exact_repost(website_id, reddit_obj):
     stats = db.get_website_stats(website_id)
     comment = bot.get_comment({"": stats}, website_id,
-                              f"I already scanned this website on {website.last_modified} UTC")
+                              "I already scanned this website on " + website.last_modified + " UTC")
     print(comment)
     print("Exact repost!")
     bot.reply(reddit_obj, comment)
@@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):
 
     subdir_stats = db.get_subdir_stats(website_id, subdir)
     stats = db.get_website_stats(website_id)
-    comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
-                              website_id, f"I already scanned a parent directory of this website on"
-                                          f" {website.last_modified} UTC")
+    comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
+                              website_id, "I already scanned a parent directory of this website on"
+                                          + website.last_modified + " UTC")
     print(comment)
     print("Subdir repost!")
     bot.reply(reddit_obj, comment)
@@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
         lines = text.split()
         if len(lines) > 1:
             url = os.path.join(lines[1], "")  # Add trailing slash
+            scanned = db.website_has_been_scanned(url)
 
             website = db.get_website_by_url(url)
 
-            if website:
+            if website and not scanned:
+                # in progress
+                pass
+
+            if website and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_exact_repost(website.id, comment)
                 continue
 
             website_id = db.website_exists(url)
-            if website_id:
+            if website_id and not scanned:
+                # IN progress
+                pass
+            if website_id and db.website_has_been_scanned(url):
                 bot.log_crawl(comment.id)
                 handle_subdir_repost(website_id, comment)
                 continue
@@ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50):
     if not od_util.is_valid_url(url):
         print("Skipping reddit comment: Invalid url")
         bot.log_crawl(comment.id)
-        bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                           f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
+        bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                           "provided: `" + url + "` is not valid. Make sure that you include the"
+                           " `http(s)://` prefix. \n")
         continue
 
+    if od_util.is_blacklisted(url):
+        print("Skipping reddit comment: blacklisted")
+        bot.log_crawl(comment.id)
+        bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted "
+                           "this website. If you think that this is an error, please "
+                           "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+
     if not od_util.is_od(url):
         print("Skipping reddit comment: Not an OD")
         print(url)
         bot.log_crawl(comment.id)
-        bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
-                           f"{url}` does not point to an open directory. This could also mean that the "
-                           f"website is not responding (in which case, feel free to retry in a few minutes)"
-                           f" If you think that this is an error, please "
-                           f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
+        bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
+                           "provided: `" + url + "` does not point to an open directory. This could also"
+                           " mean that the website is not responding (in which case, feel free to retry in "
+                           "a few minutes). If you think that this is an error, please "
+                           "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
         continue
 
     bot.log_crawl(comment.id)
@@ -116,6 +132,11 @@ for s in submissions:
         bot.log_crawl(s.id)
         continue
 
+    if od_util.is_blacklisted(url):
+        print("Skipping reddit post: blacklisted")
+        bot.log_crawl(s.id)
+        continue
+
     if not od_util.is_od(url):
         print("Skipping reddit post: Not an OD")
         print(url)
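For readability, a condensed sketch of the per-comment decision flow this commit introduces. It is not the file's code: the commit's "in progress" branches are bare pass statements that fall through, the predicates here are plain booleans, and the real loop calls db.website_has_been_scanned(url) a second time rather than reusing scanned (equivalent barring a concurrent update):

# Hypothetical condensation of the comment-handling flow; the returned labels
# stand in for the bot's replies and the "in progress" placeholders.

def classify(scanned, known_website, known_parent, blacklisted, is_od):
    if known_website:
        if not scanned:
            return "in progress"   # bare `pass` placeholder in the commit
        return "exact repost"      # -> handle_exact_repost()
    if known_parent:
        if not scanned:
            return "in progress"   # bare `pass` placeholder in the commit
        return "subdir repost"     # -> handle_subdir_repost()
    if blacklisted:
        return "blacklisted"       # new gate added by this commit
    if not is_od:
        return "not an open directory"
    return "enqueue"

print(classify(scanned=True, known_website=True, known_parent=False,
               blacklisted=False, is_od=True))  # -> exact repost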
[file header lost in capture — JavaScript utility]
@@ -113,7 +113,7 @@ function getRandomColor() {
     */
 function humanFileSize(bytes) {
 
-    if(bytes === 0) {
+    if(bytes <= 0) {
         return "? B"
     }
 
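The JavaScript change widens the guard from === 0 to <= 0, so a bogus negative size now renders as "? B" instead of falling through to the unit loop. A Python analogue of the same guard (humanFileSize's full body is not part of the diff, so the formatting loop below is illustrative only):

def human_file_size(num_bytes):
    # Widened guard, mirroring the JS change: catches 0 and negative values.
    if num_bytes <= 0:
        return "? B"
    # Illustrative formatting loop; not taken from the repository.
    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
        if num_bytes < 1024:
            return "%.1f %s" % (num_bytes, unit)
        num_bytes /= 1024
    return "%.1f PiB" % num_bytes

print(human_file_size(-1))    # ? B  (=== 0 would have let this through)
print(human_file_size(2048))  # 2.0 KiB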