mirror of
https://github.com/simon987/od-database.git
synced 2025-04-18 01:46:46 +00:00
107 lines
2.3 KiB
Python
107 lines
2.3 KiB
Python
import requests
|
|
from urllib.parse import urljoin
|
|
from bs4 import BeautifulSoup
|
|
import os
|
|
import validators
|
|
import re
|
|
import mimetypes
|
|
|
|
|
|
def truncate_path(path, max_len):
    """Shorten *path* so its length stays under *max_len*.

    Leading directory components are progressively replaced with ".../"
    until the result fits.  Falls back to ".../<basename>" (or the path
    itself when it contains no "/") if no substitution is short enough.
    """
    # Already short enough: return unchanged.  (Previously the function
    # always replaced at least one component, mangling paths that fit.)
    if len(path) < max_len:
        return path

    pattern = re.compile(r"/?.*?/")

    # Replace the first i leading components with ".../" and stop as
    # soon as the shortened form fits.
    for i in range(1, path.count("/")):
        new_path = pattern.sub(".../", path, i)
        if len(new_path) < max_len:
            return new_path

    # Nothing fit: keep only the final component.
    return ".../" + path.rsplit("/", maxsplit=1)[1] if "/" in path else path
|
|
|
|
|
|
# Maps a top-level MIME category to the CSS class used to colour it.
colors = {
    category: "bg-" + category
    for category in ("application", "text", "video", "image", "audio")
}
|
|
|
|
|
|
def get_color(mime):
    """Return the CSS colour class for *mime*'s top-level category, or None."""
    category = mime.split("/", maxsplit=1)[0]
    return colors.get(category)
|
|
|
|
|
|
def get_mime(file_name):
    """Guess the MIME type from *file_name*'s extension; None when unknown."""
    guessed, _encoding = mimetypes.guess_type(file_name)
    return guessed or None
|
|
|
|
|
|
def is_valid_url(url):
    """Validate *url*: must be http(s), end with "/", and pass validators.url."""
    has_trailing_slash = url.endswith("/")
    has_http_scheme = url.startswith(("http://", "https://"))
    if not (has_trailing_slash and has_http_scheme):
        return False
    return validators.url(url)
|
|
|
|
|
|
def has_extension(link):
    """Return True when *link* ends in a non-empty file extension."""
    _root, ext = os.path.splitext(link)
    return ext != ""
|
|
|
|
|
|
def is_external_link(base_url, url: str):
    """Return True when *url*, resolved against *base_url*, leaves the base.

    NOTE(review): this is a substring test, so any absolute URL that merely
    *contains* base_url is treated as internal — confirm that is intended.
    """
    resolved = urljoin(base_url, url).strip()
    return base_url not in resolved
|
|
|
|
|
|
def is_od(url):
    """Heuristically decide whether *url* looks like an open directory.

    Fetches the page without following redirects, then rejects it when it
    carries too many external links, <link> tags or <script> tags —
    ordinary web pages have many of these, bare directory listings do not.
    Returns False on any request/parse failure.
    """
    if not url.endswith("/"):
        print("Url does not end with trailing /")
        return False

    try:
        response = requests.get(url, timeout=15, allow_redirects=False)
        if response.status_code != 200:
            print("No redirects allowed!")
            return False

        page = BeautifulSoup(response.text, "lxml")

        external_links = sum(
            1
            for anchor in page.find_all("a")
            if is_external_link(url, anchor.get("href"))
        )
        link_tags = len(page.find_all("link"))
        script_tags = len(page.find_all("script"))

        if external_links > 11:
            print("Too many external links!")
            return False
        if link_tags > 5:
            print("Too many link tags!")
            return False
        if script_tags > 7:
            print("Too many script tags!")
            return False

        return True
    except Exception as e:
        # Best-effort probe: any failure (network, parse, bad href) means
        # "not an open directory".
        print(e)
        return False
|
|
|
|
|
|
def is_blacklisted(url):
    """Return True when *url* starts with any prefix listed in blacklist.txt."""
    with open("blacklist.txt", "r") as f:
        return any(url.startswith(prefix.strip()) for prefix in f)
|