mirror of
				https://github.com/simon987/od-database.git
				synced 2025-10-25 11:56:51 +00:00 
			
		
		
		
	Added blacklist feature (untested)
This commit is contained in:
		
							parent
							
								
									cfa6a9f02f
								
							
						
					
					
						commit
						0b1d76f478
					
				
							
								
								
									
										8
									
								
								app.py
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								app.py
									
									
									
									
									
								
							| @ -130,10 +130,16 @@ def enqueue(): | ||||
|                   "FTP is not supported", "danger") | ||||
|             return redirect("/submit") | ||||
| 
 | ||||
|         if od_util.is_blacklisted(url): | ||||
|             flash("<strong>Error:</strong> " | ||||
|                   "Sorry, this website has been blacklisted. If you think " | ||||
|                   "this is an error, please <a href='/contribute'>contact me</a>.", "danger") | ||||
|             return redirect("/submit") | ||||
| 
 | ||||
|         if not od_util.is_od(url): | ||||
|             flash("<strong>Error:</strong>" | ||||
|                   "The anti-spam algorithm determined that the submitted url is not " | ||||
|                   "an open directory or the server is not responding. If you think" | ||||
|                   "an open directory or the server is not responding. If you think " | ||||
|                   "this is an error, please <a href='/contribute'>contact me</a>.", "danger") | ||||
| 
 | ||||
|             return redirect("/submit") | ||||
|  | ||||
							
								
								
									
										5
									
								
								blacklist.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								blacklist.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,5 @@ | ||||
| https://sdo.gsfc.nasa.gov | ||||
| https://drive.google | ||||
| https://mirror.math.princeton.edu | ||||
| http://mirror.math.princeton.edu | ||||
| https://www.dropbox.com | ||||
							
								
								
									
										22
									
								
								database.py
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								database.py
									
									
									
									
									
								
							| @ -87,11 +87,14 @@ class Database: | ||||
| 
 | ||||
|             conn.commit() | ||||
|             # Then insert files | ||||
|             cursor.execute("PRAGMA foreign_keys = OFF") | ||||
|             conn.commit() | ||||
|             cursor.executemany("INSERT INTO File (path_id, name, size, mime_id) VALUES (?,?,?,?)", | ||||
|                                [(website_paths[x.path], x.name, x.size, mimetypes[x.mime]) for x in files]) | ||||
| 
 | ||||
|             # Update date | ||||
|             if len(files) > 0: | ||||
|                 cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id = ?", | ||||
|                                (files[0].website_id, )) | ||||
| 
 | ||||
|             conn.commit() | ||||
| 
 | ||||
|     def import_json(self, json_file, website: Website): | ||||
| @ -302,6 +305,21 @@ class Database: | ||||
|             website_id = cursor.fetchone() | ||||
|             return website_id[0] if website_id else None | ||||
| 
 | ||||
|     def website_has_been_scanned(self, url): | ||||
|         """Check if a website has at least 1 file""" | ||||
| 
 | ||||
|         with sqlite3.connect(self.db_path) as conn: | ||||
|             cursor = conn.cursor() | ||||
| 
 | ||||
|             website_id = self.website_exists(url) | ||||
| 
 | ||||
|             if website_id: | ||||
|                 cursor.execute("SELECT COUNT(Path.id) FROM Website " | ||||
|                                "INNER JOIN WebsitePath Path on Website.id = Path.website_id " | ||||
|                                "WHERE Website.id = ?", (website_id, )) | ||||
|                 return cursor.fetchone()[0] > 0 | ||||
|         return None | ||||
| 
 | ||||
|     def clear_website(self, website_id): | ||||
|         """Remove all files from a website and update its last_updated date""" | ||||
|         with sqlite3.connect(self.db_path) as conn: | ||||
|  | ||||
							
								
								
									
										13
									
								
								od_util.py
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								od_util.py
									
									
									
									
									
								
							| @ -61,9 +61,6 @@ def is_external_link(base_url, url: str): | ||||
| 
 | ||||
| 
 | ||||
| def is_od(url): | ||||
|     if "?" in url: | ||||
|         print("Url has parameter in url!") | ||||
|         return False | ||||
| 
 | ||||
|     if not url.endswith("/"): | ||||
|         print("Url does not end with trailing /") | ||||
| @ -97,3 +94,13 @@ def is_od(url): | ||||
|     except Exception as e: | ||||
|         print(e) | ||||
|         return False | ||||
| 
 | ||||
| 
 | ||||
def is_blacklisted(url):
    """Return True when *url* starts with an entry of ``blacklist.txt``.

    The file is read from the current working directory on every call and
    holds one url prefix per line. Lines that are empty after stripping
    whitespace are ignored.

    :param url: url to check
    :return: ``True`` if a non-empty blacklist line is a prefix of *url*,
        ``False`` otherwise
    """
    with open("blacklist.txt", "r") as f:
        # Iterate the file lazily instead of loading it with readlines().
        for line in f:
            prefix = line.strip()
            # ``str.startswith("")`` is always True, so a blank line would
            # otherwise blacklist every url.
            if prefix and url.startswith(prefix):
                return True

    return False
|  | ||||
| @ -19,7 +19,7 @@ submissions = [] | ||||
| def handle_exact_repost(website_id, reddit_obj): | ||||
|     stats = db.get_website_stats(website_id) | ||||
|     comment = bot.get_comment({"": stats}, website_id, | ||||
|                               f"I already scanned this website on {website.last_modified} UTC") | ||||
|                               "I already scanned this website on " + website.last_modified + " UTC") | ||||
|     print(comment) | ||||
|     print("Exact repost!") | ||||
|     bot.reply(reddit_obj, comment) | ||||
| @ -33,9 +33,9 @@ def handle_subdir_repost(website_id, reddit_obj): | ||||
| 
 | ||||
|     subdir_stats = db.get_subdir_stats(website_id, subdir) | ||||
|     stats = db.get_website_stats(website_id) | ||||
|     comment = bot.get_comment({"Parent directory:": stats, f"Subdirectory `/{subdir}`:": subdir_stats}, | ||||
|                               website_id, f"I already scanned a parent directory of this website on" | ||||
|                                           f" {website.last_modified} UTC") | ||||
|     comment = bot.get_comment({"Parent directory:": stats, "Subdirectory `/" + subdir + "`:": subdir_stats}, | ||||
|                               website_id, "I already scanned a parent directory of this website on" | ||||
|                               + website.last_modified + " UTC") | ||||
|     print(comment) | ||||
|     print("Subdir repost!") | ||||
|     bot.reply(reddit_obj, comment) | ||||
| @ -50,16 +50,24 @@ for comment in []: #subreddit.comments(limit=50): | ||||
|             lines = text.split() | ||||
|             if len(lines) > 1: | ||||
|                 url = os.path.join(lines[1], "")  # Add trailing slash | ||||
|                 scanned = db.website_has_been_scanned(url) | ||||
| 
 | ||||
|                 website = db.get_website_by_url(url) | ||||
| 
 | ||||
|                 if website: | ||||
|                 if website and not scanned: | ||||
|                     # in progress | ||||
|                     pass | ||||
| 
 | ||||
|                 if website and db.website_has_been_scanned(url): | ||||
|                     bot.log_crawl(comment.id) | ||||
|                     handle_exact_repost(website.id, comment) | ||||
|                     continue | ||||
| 
 | ||||
|                 website_id = db.website_exists(url) | ||||
|                 if website_id: | ||||
|                 if website_id and not scanned: | ||||
|                     # IN progress | ||||
|                     pass | ||||
|                 if website_id and db.website_has_been_scanned(url): | ||||
|                     bot.log_crawl(comment.id) | ||||
|                     handle_subdir_repost(website_id, comment) | ||||
|                     continue | ||||
| @ -67,19 +75,27 @@ for comment in []: #subreddit.comments(limit=50): | ||||
|                 if not od_util.is_valid_url(url): | ||||
|                     print("Skipping reddit comment: Invalid url") | ||||
|                     bot.log_crawl(comment.id) | ||||
|                     bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" | ||||
|                                        f"{url}` is not valid. Make sure that you include the `http(s)://` prefix.    \n") | ||||
|                     bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you " | ||||
|                                        "provided: `" + url + "` is not valid. Make sure that you include the" | ||||
|                                        "'`http(s)://` prefix.    \n") | ||||
|                     continue | ||||
| 
 | ||||
|                 if od_util.is_blacklisted(url): | ||||
|                     print("Skipping reddit comment: blacklisted") | ||||
|                     bot.log_crawl(comment.id) | ||||
|                     bot.reply(comment, "Hello, " + comment.author + ". Unfortunately my programmer has blacklisted " | ||||
|                                        "this website. If you think that this is an error, please " | ||||
|                                        "[contact him](https://www.reddit.com/message/compose?to=Hexahedr_n)") | ||||
| 
 | ||||
|                 if not od_util.is_od(url): | ||||
|                     print("Skipping reddit comment: Not an OD") | ||||
|                     print(url) | ||||
|                     bot.log_crawl(comment.id) | ||||
|                     bot.reply(comment, f"Hello, {comment.author}. Unfortunately it seems that the link you provided: `" | ||||
|                                        f"{url}` does not point to an open directory. This could also mean that the " | ||||
|                                        f"website is not responding (in which case, feel free to retry in a few minutes)" | ||||
|                                        f" If you think that this is an error, please " | ||||
|                                        f"[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)") | ||||
|                     bot.reply(comment, "Hello, " + comment.author + ". Unfortunately it seems that the link you " | ||||
|                                        "provided: `" + url + "` does not point to an open directory. This could also" | ||||
|                                        " mean that the website is not responding (in which case, feel free to retry in " | ||||
|                                        "a few minutes). If you think that this is an error, please " | ||||
|                                        "[contact my programmer](https://www.reddit.com/message/compose?to=Hexahedr_n)") | ||||
|                     continue | ||||
| 
 | ||||
|                 bot.log_crawl(comment.id) | ||||
| @ -116,6 +132,11 @@ for s in submissions: | ||||
|                 bot.log_crawl(s.id) | ||||
|                 continue | ||||
| 
 | ||||
|             if od_util.is_blacklisted(url): | ||||
|                 print("Skipping reddit post: blacklisted") | ||||
|                 bot.log_crawl(s.id) | ||||
|                 continue | ||||
| 
 | ||||
|             if not od_util.is_od(url): | ||||
|                 print("Skipping reddit post: Not an OD") | ||||
|                 print(url) | ||||
|  | ||||
| @ -113,7 +113,7 @@ function getRandomColor() { | ||||
|  */ | ||||
| function humanFileSize(bytes) { | ||||
| 
 | ||||
|     if(bytes === 0) { | ||||
|     if(bytes <= 0) { | ||||
|         return "? B" | ||||
|     } | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user