mirror of
https://github.com/simon987/od-database.git
synced 2025-12-13 23:09:01 +00:00
Initial commit
This commit is contained in:
99
od_util.py
Normal file
99
od_util.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import requests
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import validators
|
||||
import re
|
||||
import mimetypes
|
||||
|
||||
|
||||
def truncate_path(path, max_len):
    """Shorten *path* for display by collapsing leading segments to ".../".

    Tries replacing 1, 2, ... leading path segments (each becomes ".../")
    until the result fits under max_len.  If nothing fits, falls back to
    ".../" plus the final component — note the fallback may still exceed
    max_len.  A path without any "/" is returned unchanged.
    """
    segment = re.compile(r"/?.*?/")

    for count in range(1, path.count("/")):
        candidate = segment.sub(".../", path, count)
        if len(candidate) < max_len:
            return candidate

    if "/" in path:
        return ".../" + path.rsplit("/", maxsplit=1)[1]
    return path
|
||||
|
||||
|
||||
# Badge CSS class for each top-level MIME category.
colors = dict(
    application="bg-application",
    text="bg-text",
    video="bg-video",
    image="bg-image",
    audio="bg-audio",
)
|
||||
|
||||
|
||||
def get_color(mime):
    """Return the CSS class for *mime*'s top-level category, or None."""
    category, _, _ = mime.partition("/")
    return colors.get(category)
|
||||
|
||||
|
||||
def get_mime(file_name):
    """Guess the MIME type from the file name; None when it can't be guessed."""
    # guess_type returns (type, encoding); type is already None when unknown.
    guessed_type, _encoding = mimetypes.guess_type(file_name)
    return guessed_type
|
||||
|
||||
|
||||
def is_valid_url(url):
    """Cheap pre-checks, then full validation via the validators package.

    Requires a trailing "/" and an http(s) scheme before asking
    validators.url; returns whatever validators.url returns (truthy on
    success, a falsy failure object otherwise).
    """
    has_trailing_slash = url.endswith("/")
    has_http_scheme = url.startswith(("http://", "https://"))
    if not (has_trailing_slash and has_http_scheme):
        return False

    return validators.url(url)
|
||||
|
||||
|
||||
def has_extension(link):
    """True when the link's final path component carries a file extension."""
    _root, extension = os.path.splitext(link)
    return len(extension) > 0
|
||||
|
||||
|
||||
def is_external_link(base_url, url: str):
    """Return True when *url* (possibly relative) points outside *base_url*.

    Accepts None/empty hrefs (treated as internal, i.e. False) so callers
    can pass `a.get("href")` directly without a bare <a> tag raising inside
    urljoin and aborting the whole page scan.
    """
    if not url:
        # Missing or empty href: not a link to anywhere external.
        return False

    # Resolve relative links against the page URL; strip stray whitespace
    # from the raw href before joining rather than after.
    full_url = urljoin(base_url, url.strip())

    # Prefix check, not substring: an external URL that merely *mentions*
    # the base (e.g. in a query string) must still count as external.
    return not full_url.startswith(base_url)
|
||||
|
||||
|
||||
def is_od(url):
    """Heuristically decide whether *url* looks like an open directory listing.

    Rejects URLs with query parameters or without a trailing slash, fetches
    the page (no redirects), and rejects pages that look like regular
    websites: too many external links, <link> tags, or <script> tags.
    Prints the reason for each rejection; returns True/False.
    """
    if "?" in url:
        print("Url has parameter in url!")
        return False

    if not url.endswith("/"):
        print("Url does not end with trailing /")
        return False

    try:
        # Redirects are deliberately not followed: a real listing should be
        # served directly at the given URL.
        r = requests.get(url, timeout=15, allow_redirects=False)
        if r.status_code != 200:
            # Covers error statuses and 3xx responses alike.
            print("Status code is not 200 (redirects are not followed)!")
            return False

        soup = BeautifulSoup(r.text, "lxml")

        # Skip <a> tags without an href so bare anchors can't break the scan.
        external_links = sum(
            1 for a in soup.find_all("a")
            if a.get("href") and is_external_link(url, a.get("href"))
        )
        link_tags = len(soup.find_all("link"))
        script_tags = len(soup.find_all("script"))

        # Index pages are plain; these thresholds reject styled/scripted sites.
        if external_links > 11:
            print("Too many external links!")
            return False

        if link_tags > 5:
            print("Too many link tags!")
            return False

        if script_tags > 7:
            print("Too many script tags!")
            return False

        return True

    except Exception as e:
        # Best-effort boundary: any network/parse failure means "not an OD".
        print(e)
        return False
|
||||
Reference in New Issue
Block a user