od-database/od_util.py
2018-05-28 20:35:04 -04:00

100 lines
2.2 KiB
Python

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import os
import validators
import re
import mimetypes
def truncate_path(path, max_len):
    """Shorten *path* for display by collapsing leading components into ".../".

    Tries replacing 1, 2, ... leading path components with ".../" until the
    result fits under *max_len* characters; if nothing fits, falls back to
    ".../" plus the final component. A path with fewer than two slashes goes
    straight to the fallback (the loop body never runs for it).
    """
    segment = re.compile(r"/?.*?/")
    for depth in range(1, path.count("/")):
        candidate = segment.sub(".../", path, depth)
        if len(candidate) < max_len:
            return candidate
    # Last resort: keep only the final component (or the whole string if
    # there is no slash at all).
    if "/" in path:
        return ".../" + path.rsplit("/", maxsplit=1)[1]
    return path
# CSS badge class for each top-level MIME type shown in the UI.
colors = {
    "application": "bg-application",
    "text": "bg-text",
    "video": "bg-video",
    "image": "bg-image",
    "audio": "bg-audio"
}


def get_color(mime):
    """Return the CSS class for a full MIME type (e.g. "text/plain"),
    or None when its top-level type has no entry in ``colors``."""
    top_level = mime.split("/", maxsplit=1)[0]
    return colors.get(top_level)
def get_mime(file_name):
    """Guess the MIME type from *file_name*'s extension.

    Returns the type string (e.g. "text/plain") or None when the extension
    is unknown; the encoding part of guess_type's result is ignored.
    """
    guessed_type, _encoding = mimetypes.guess_type(file_name)
    return guessed_type if guessed_type else None
def is_valid_url(url):
    """Validate a candidate directory URL.

    Requires a trailing "/" and an explicit http(s) scheme before delegating
    the full syntax check to validators.url (whose result is returned as-is).
    """
    looks_like_dir = url.endswith("/")
    has_http_scheme = url.startswith(("http://", "https://"))
    if looks_like_dir and has_http_scheme:
        return validators.url(url)
    return False
def has_extension(link):
    """Return True when the link's final path component has a file extension
    (per os.path.splitext; dotfiles like ".hidden" do not count)."""
    _root, extension = os.path.splitext(link)
    return extension != ""
def is_external_link(base_url, url: str):
    """Return True when *url* resolves outside *base_url*.

    *url* may be relative; it is resolved against *base_url* first, and the
    result is stripped of surrounding whitespace before comparison.

    Fix: the original used a substring test (``base_url in url``), which
    wrongly classified external URLs that merely *contain* the base — e.g.
    "http://evil.com/?u=http://base.com/" — as internal. A URL is internal
    only when the resolved form actually starts with the base.
    """
    resolved = urljoin(base_url, url).strip()
    return not resolved.startswith(base_url)
def is_od(url):
    """Heuristically decide whether *url* serves an open directory listing.

    Rejects URLs with query parameters or no trailing slash, then fetches the
    page (no redirects, 15 s timeout) and rejects it when it looks like a
    regular website: too many external links (> 11), <link> tags (> 5) or
    <script> tags (> 7). Any request/parse failure is reported and treated
    as "not an open directory".
    """
    if "?" in url:
        print("Url has parameter in url!")
        return False
    if not url.endswith("/"):
        print("Url does not end with trailing /")
        return False
    try:
        r = requests.get(url, timeout=15, allow_redirects=False)
        # allow_redirects=False means any redirect shows up as a non-200 here.
        if r.status_code != 200:
            print("No redirects allowed!")
            return False
        soup = BeautifulSoup(r.text, "lxml")
        # Fix: anchors without an href (e.g. <a name="...">) used to reach
        # urljoin(base, None), raise inside the broad except below, and flip
        # the whole page to "not an OD". Skip href-less anchors instead.
        external_links = sum(
            1
            for a in soup.find_all("a")
            if a.get("href") is not None and is_external_link(url, a.get("href"))
        )
        link_tags = len(soup.find_all("link"))
        script_tags = len(soup.find_all("script"))
        if external_links > 11:
            print("Too many external links!")
            return False
        if link_tags > 5:
            print("Too many link tags!")
            return False
        if script_tags > 7:
            print("Too many script tags!")
            return False
        return True
    except Exception as e:
        # Best-effort heuristic: report and treat any failure as "not an OD".
        print(e)
        return False