mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
Added blacklist feature (untested)
This commit is contained in:
parent
cfa6a9f02f
commit
0b1d76f478
8
app.py
8
app.py
@ -130,10 +130,16 @@ def enqueue():
|
||||
"FTP is not supported", "danger")
|
||||
return redirect("/submit")
|
||||
|
||||
if od_util.is_blacklisted(url):
|
||||
flash("<strong>Error:</strong> "
|
||||
"Sorry, this website has been blacklisted. If you think "
|
||||
"this is an error, please <a href='/contribute'>contact me</a>.", "danger")
|
||||
return redirect("/submit")
|
||||
|
||||
if not od_util.is_od(url):
|
||||
flash("<strong>Error:</strong>"
|
||||
"The anti-spam algorithm determined that the submitted url is not "
|
||||
"an open directory or the server is not responding. If you think"
|
||||
"an open directory or the server is not responding. If you think "
|
||||
"this is an error, please <a href='/contribute'>contact me</a>.", "danger")
|
||||
|
||||
return redirect("/submit")
|
||||
|
5
blacklist.txt
Normal file
5
blacklist.txt
Normal file
@ -0,0 +1,5 @@
|
||||
https://sdo.gsfc.nasa.gov
|
||||
https://drive.google
|
||||
https://mirror.math.princeton.edu
|
||||
http://mirror.math.princeton.edu
|
||||
https://www.dropbox.com
|
22
database.py
22
database.py
@ -87,11 +87,14 @@ class Database:
|
||||
|
||||
conn.commit()
|
||||
# Then insert files
|
||||
cursor.execute("PRAGMA foreign_keys = OFF")
|
||||
conn.commit()
|
||||
cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)",
|
||||
[(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files])
|
||||
|
||||
# Update date
|
||||
if len(files) > 0:
|
||||
cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?",
|
||||
(files[0].website_id, ))
|
||||
|
||||
conn.commit()
|
||||
|
||||
def import_json(self, json_file, website: Website):
|
||||
@ -302,6 +305,21 @@ class Database:
|
||||
website_id = cursor.fetchone()
|
||||
return website_id[0] if website_id else None
|
||||
|
||||
def website_has_been_scanned(self, url):
    """Check whether a website has at least one indexed path.

    The website id is resolved through ``self.website_exists(url)``; the
    website counts as scanned when at least one ``WebsitePath`` row
    references that id.

    :param url: URL of the website to look up
    :return: ``True`` if the website has one or more paths, ``False`` if it
        is known but has none, ``None`` if the website is not known at all
    """
    website_id = self.website_exists(url)
    if not website_id:
        # Unknown website: nothing to count, so do not open a connection.
        return None

    # sqlite3's connection context manager only wraps a transaction and
    # does not close the connection, so close it explicitly.
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(Path.id) FROM Website "
                       "INNER JOIN WebsitePath Path on Website.id = Path.website_id "
                       "WHERE Website.id = ?", (website_id, ))
        return cursor.fetchone()[0] > 0
    finally:
        conn.close()
|
||||
|
||||
def clear_website(self, website_id):
|
||||
"""Remove all files from a website and update its last_updated date"""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
|
13
od_util.py
13
od_util.py
@ -61,9 +61,6 @@ def is_external_link(base_url, url: str):
|
||||
|
||||
|
||||
def is_od(url):
|
||||
if "?" in url:
|
||||
print("Url has parameter in url!")
|
||||
return False
|
||||
|
||||
if not url.endswith("/"):
|
||||
print("Url does not end with trailing /")
|
||||
@ -97,3 +94,13 @@ def is_od(url):
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return False
|
||||
|
||||
|
||||
def is_blacklisted(url, blacklist_path="blacklist.txt"):
    """Tell whether ``url`` starts with one of the blacklisted prefixes.

    The blacklist file holds one URL prefix per line. Surrounding
    whitespace is ignored and blank lines are skipped.

    :param url: URL to check
    :param blacklist_path: path of the blacklist file, defaults to
        ``blacklist.txt`` in the current working directory
    :return: ``True`` if ``url`` starts with any listed prefix, else ``False``
    """
    with open(blacklist_path, "r") as f:
        for line in f:
            prefix = line.strip()
            # An empty prefix would match every URL (str.startswith("") is
            # always True), so a stray blank line must not count as an entry.
            if prefix and url.startswith(prefix):
                return True

    return False
|
||||
|
@ -19,7 +19,7 @@ submissions = []
|
||||
def handle_exact_repost(website_id, reddit_obj):
|
||||
stats = db.get_website_stats(website_id)
|
||||
comment = bot.get_comment({"": stats}, website_id,
|
||||
f"I already scanned this website on {website.last_modified} UTC")
|
||||
"I already scanned this website on " + website.last_modified + " UTC")
|
||||
print(comment)
|
||||
print("Exact repost!")
|
||||
bot.reply(reddit_obj, comment)
|
||||
@ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj):
|
||||
|
||||
subdir_stats = db.get_subdir_stats(website_id, subdir)
|
||||
stats = db.get_website_stats(website_id)
|
||||
comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats},
|
||||
website_id, f"I already scanned a parent directory of this website on"
|
||||
f" {website.last_modified} UTC")
|
||||
comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats},
|
||||
website_id, "I already scanned a parent directory of this website on"
|
||||
+ website.last_modified + " UTC")
|
||||
print(comment)
|
||||
print("Subdir repost!")
|
||||
bot.reply(reddit_obj, comment)
|
||||
@ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50):
|
||||
lines = text.split()
|
||||
if len(lines) > 1:
|
||||
url = os.path.join(lines[1], "") # Add trailing slash
|
||||
scanned = db.website_has_been_scanned(url)
|
||||
|
||||
website = db.get_website_by_url(url)
|
||||
|
||||
if website:
|
||||
if website and not scanned:
|
||||
# in progress
|
||||
pass
|
||||
|
||||
if website and db.website_has_been_scanned(url):
|
||||
bot.log_crawl(comment.id)
|
||||
handle_exact_repost(website.id, comment)
|
||||
continue
|
||||
|
||||
website_id = db.website_exists(url)
|
||||
if website_id:
|
||||
if website_id and not scanned:
|
||||
# IN progress
|
||||
pass
|
||||
if website_id and db.website_has_been_scanned(url):
|
||||
bot.log_crawl(comment.id)
|
||||
handle_subdir_repost(website_id, comment)
|
||||
continue
|
||||
@ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50):
|
||||
if not od_util.is_valid_url(url):
|
||||
print("Skipping reddit comment: Invalid url")
|
||||
bot.log_crawl(comment.id)
|
||||
bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
|
||||
f"{url}` is not valid. Make sure that you include the `http(s)://` prefix. \n")
|
||||
bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
|
||||
"provided: `" + url + "` is not valid. Make sure that you include the"
|
||||
"'`http(s)://` prefix. \n")
|
||||
continue
|
||||
|
||||
if od_util.is_blacklisted(url):
|
||||
print("Skipping reddit comment: blacklisted")
|
||||
bot.log_crawl(comment.id)
|
||||
bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted "
|
||||
"this website. If you think that this is an error, please "
|
||||
"[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)")
|
||||
|
||||
if not od_util.is_od(url):
|
||||
print("Skipping reddit comment: Not an OD")
|
||||
print(url)
|
||||
bot.log_crawl(comment.id)
|
||||
bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `"
|
||||
f"{url}` does not point to an open directory. This could also mean that the "
|
||||
f"website is not responding (in which case, feel free to retry in a few minutes)"
|
||||
f" If you think that this is an error, please "
|
||||
f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
|
||||
bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you "
|
||||
"provided: `" + url + "` does not point to an open directory. This could also"
|
||||
" mean that the website is not responding (in which case, feel free to retry in "
|
||||
"a few minutes). If you think that this is an error, please "
|
||||
"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)")
|
||||
continue
|
||||
|
||||
bot.log_crawl(comment.id)
|
||||
@ -116,6 +132,11 @@ for s in submissions:
|
||||
bot.log_crawl(s.id)
|
||||
continue
|
||||
|
||||
if od_util.is_blacklisted(url):
|
||||
print("Skipping reddit post: blacklisted")
|
||||
bot.log_crawl(s.id)
|
||||
continue
|
||||
|
||||
if not od_util.is_od(url):
|
||||
print("Skipping reddit post: Not an OD")
|
||||
print(url)
|
||||
|
@ -113,7 +113,7 @@ function getRandomColor() {
|
||||
*/
|
||||
function humanFileSize(bytes) {
|
||||
|
||||
if(bytes === 0) {
|
||||
if(bytes <= 0) {
|
||||
return "? B"
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user