# Mirror of https://github.com/simon987/od-database.git
# Synced 2025-04-04 06:52:59 +00:00
# 249 lines, 8.8 KiB, Python
import os
|
|
import re
|
|
from ftplib import FTP
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import requests
|
|
import validators
|
|
from bs4 import BeautifulSoup
|
|
|
|
# TODO: find a better way to do this
|
|
try:
|
|
from . import config
|
|
except (ImportError, SystemError):
|
|
import config
|
|
|
|
import urllib3
|
|
urllib3.disable_warnings()
|
|
|
|
|
|
def truncate_path(path, max_len):
    """Shorten ``path`` for display by collapsing leading segments to ``.../``.

    Leading ``segment/`` chunks are replaced one at a time (first one, then
    the first two, ...) and the first candidate strictly shorter than
    ``max_len`` is returned. When no candidate qualifies, only the part after
    the last ``/`` is kept, prefixed with ``.../``. A path containing no
    ``/`` is returned unchanged.
    """
    segment = re.compile(r"/?.*?/")
    slash_total = path.count("/")

    collapsed = 1
    # The last separator is never collapsed here; the fallback handles it.
    while collapsed < slash_total:
        candidate = segment.sub(".../", path, collapsed)
        if len(candidate) < max_len:
            return candidate
        collapsed += 1

    if "/" not in path:
        return path
    return ".../" + path.rsplit("/", maxsplit=1)[1]
|
|
|
|
|
|
# Maps a lowercase file extension (without the leading dot) to one of the
# category names used as keys of ``colors``: application, text, video, audio,
# image or archive.
category_map = {

    # Application category
    'bcpio': 'application', 'bin': 'application', 'cdf': 'application',
    'csh': 'application', 'dll': 'application', 'doc': 'application',
    'dot': 'application', 'dvi': 'application', 'eml': 'application',
    'exe': 'application', 'hdf': 'application',
    'man': 'application', 'me': 'application', 'mht': 'application',
    'mhtml': 'application', 'mif': 'application', 'ms': 'application',
    'nc': 'application', 'nws': 'application', 'o': 'application',
    'obj': 'application', 'oda': 'application', 'p12': 'application',
    'p7c': 'application', 'pfx': 'application', 'tr': 'application',
    'ppa': 'application', 'pps': 'application', 'ppt': 'application',
    'ps': 'application', 'pwz': 'application', 'pyc': 'application',
    'pyo': 'application', 'ram': 'application', 'rdf': 'application',
    'roff': 'application', 'sh': 'application', 'so': 'application',
    'src': 'application', 'sv4cpio': 'application', 'sv4crc': 'application',
    't': 'application', 'tcl': 'application', 'tex': 'application',
    'texi': 'application', 'texinfo': 'application', 'ustar': 'application',
    'wiz': 'application', 'wsdl': 'application', 'xlb': 'application',
    'xls': 'application', 'xpdl': 'application', 'xsl': 'application',
    'torrent': 'application', 'rpm': 'application', 'deb': 'application',
    'atr': 'application', 'class': 'application', 'ttf': 'application',
    'img': 'application', 'msi': 'application', 'run': 'application',
    'drpm': 'application', 'udeb': 'application', 'patch': 'application',
    'nes': 'application', 'ebuild': 'application', 'scr': 'application',
    # Text category
    'java': 'text', 'cpp': 'text', 'rb': 'text',
    'bat': 'text', 'latex': 'text', 'xml': 'text',
    'etx': 'text', 'htm': 'text', 'c': 'text',
    'css': 'text', 'csv': 'text', 'html': 'text',
    'js': 'text', 'json': 'text', 'ksh': 'text',
    'pl': 'text', 'pot': 'application', 'py': 'text',
    'h': 'text', 'tsv': 'text', 'rtx': 'text',
    'sgm': 'text', 'sgml': 'text', 'txt': 'text',
    'vcf': 'text', 'pdf': 'text', 'epub': 'text',
    'srt': 'text', 'inc': 'text', 'php': 'text',
    'cbz': 'text', 'docx': 'text', 'mobi': 'text',
    'chm': 'text', 'xlsx': 'text', 'djvu': 'text',
    'rtf': 'text', 'log': 'text', 'md': 'text',
    'dsc': 'text', 'info': 'text',
    # Video category
    '3g2': 'video', '3gp': 'video', 'asf': 'video',
    'asx': 'video', 'avi': 'video', 'flv': 'video',
    # 'vob' was previously keyed as 'vob:' (stray colon), so it never matched.
    'swf': 'video', 'vob': 'video', 'qt': 'video',
    'webm': 'video', 'mov': 'video', 'm1v': 'video',
    'm3u': 'video', 'm3u8': 'video', 'movie': 'video',
    'mp4': 'video', 'mpa': 'video', 'mpe': 'video',
    'mpeg': 'video', 'mpg': 'video', 'mkv': 'video',
    'wmv': 'video', 'm4s': 'video', 'ogv': 'video',
    'm4b': 'video', 'm4v': 'video', 'ts': 'video',

    # Audio category
    'wav': 'audio', 'snd': 'audio', 'mp2': 'audio',
    'aif': 'audio', 'iff': 'audio', 'm4a': 'audio',
    'mid': 'audio', 'midi': 'audio', 'mp3': 'audio',
    'wma': 'audio', 'ra': 'audio', 'aifc': 'audio',
    'aiff': 'audio', 'au': 'audio', 'flac': 'audio',
    # 'mka' was listed in this section but mapped to 'video'.
    'ogg': 'audio', 'oga': 'audio', 'mka': 'audio',
    'ac3': 'audio',
    # Image category
    'bmp': 'image', 'gif': 'image', 'jpg': 'image',
    'xwd': 'image', 'tif': 'image', 'tiff': 'image',
    'png': 'image', 'pnm': 'image', 'ras': 'image',
    'ico': 'image', 'ief': 'image', 'pgm': 'image',
    'jpe': 'image', 'pbm': 'image', 'jpeg': 'image',
    'ppm': 'image', 'xpm': 'image', 'xbm': 'image',
    'rgb': 'image', 'svg': 'image', 'psd': 'image',
    'yuv': 'image', 'ai': 'image', 'eps': 'image',
    'bw': 'image', 'hdr': 'image',
    # Archive category
    'ar': 'archive', 'cpio': 'archive', 'shar': 'archive',
    'iso': 'archive', 'lbr': 'archive', 'mar': 'archive',
    'sbx': 'archive', 'bz2': 'archive', 'f': 'archive',
    'gz': 'archive', 'lz': 'archive', 'lzma': 'archive',
    'lzo': 'archive', 'rz': 'archive', 'sfark': 'archive',
    'sz': 'archive', 'z': 'archive', '7z': 'archive',
    's7z': 'archive', 'ace': 'archive', 'afa': 'archive',
    'alz': 'archive', 'apk': 'archive', 'arc': 'archive',
    'arj': 'archive', 'b1': 'archive', 'b6z': 'archive',
    'a': 'archive', 'bh': 'archive', 'cab': 'archive',
    'car': 'archive', 'cfs': 'archive', 'cpt': 'archive',
    'dar': 'archive', 'dd': 'archive', 'dgc': 'archive',
    'dmg': 'archive', 'ear': 'archive', 'gca': 'archive',
    'ha': 'archive', 'hki': 'archive', 'ice': 'archive',
    'jar': 'archive', 'kgb': 'archive', 'lzh': 'archive',
    'lha': 'archive', 'lzx': 'archive', 'pak': 'archive',
    'partimg': 'archive', 'paq6': 'archive', 'paq7': 'archive',
    'paq8': 'archive', 'pea': 'archive', 'pim': 'archive',
    'pit': 'archive', 'qda': 'archive', 'rar': 'archive',
    'rk': 'archive', 'sda': 'archive', 'sea': 'archive',
    'sen': 'archive', 'sfx': 'archive', 'shk': 'archive',
    'sit': 'archive', 'sitx': 'archive', 'sqx': 'archive',
    'tbz2': 'archive', 'tlz': 'archive', 'xz': 'archive',
    'txz': 'archive', 'uc': 'archive', 'uc0': 'archive',
    'uc2': 'archive', 'ucn': 'archive', 'ur2': 'archive',
    'ue2': 'archive', 'uca': 'archive', 'uha': 'archive',
    'war': 'archive', 'wim': 'archive', 'xar': 'archive',
    'xp3': 'archive', 'yz1': 'archive', 'zip': 'archive',
    'zipx': 'archive', 'zoo': 'archive', 'zpaq': 'archive',
    'zz': 'archive', 'xpi': 'archive', 'tgz': 'archive',
    'tbz': 'archive', 'tar': 'archive', 'bz': 'archive',
    'diz': 'archive',
}
|
|
|
|
# Category name -> CSS class name; every class is the category name prefixed
# with "bg-".
colors = {
    category_name: "bg-" + category_name
    for category_name in (
        "application",
        "text",
        "video",
        "image",
        "audio",
        "archive",
    )
}
|
|
|
|
|
|
def get_color(category):
    """Look up ``category`` in ``colors``; return ``None`` when it is absent."""
    return colors.get(category)
|
|
|
|
|
|
def get_category(extension):
    """Look up ``extension`` in ``category_map``; return ``None`` when absent.

    The lookup is an exact key match, so the extension is expected without a
    leading dot and in the same case as the keys of ``category_map``.
    """
    return category_map.get(extension)
|
|
|
|
|
|
def is_valid_url(url):
    """Check that ``url`` looks like a submittable directory URL.

    Returns ``False`` unless the URL ends with ``/`` and starts with
    ``http://``, ``https://`` or ``ftp://``. Otherwise the result of
    ``validators.url(url)`` is returned as-is (presumably truthy for a
    well-formed URL and falsy otherwise; verify against the installed
    ``validators`` version).
    """
    accepted_schemes = ("http://", "https://", "ftp://")

    if not (url.endswith("/") and url.startswith(accepted_schemes)):
        return False

    return validators.url(url)
|
|
|
|
|
|
def has_extension(link):
    """Return ``True`` when ``os.path.splitext`` finds an extension on ``link``."""
    _, suffix = os.path.splitext(link)
    return bool(suffix)
|
|
|
|
|
|
def is_external_link(base_url, url: str):
    """Tell whether ``url`` points outside of ``base_url``.

    ``url`` is first resolved against ``base_url`` with ``urljoin`` and
    stripped of surrounding whitespace. The link is internal only when the
    resolved URL *starts with* ``base_url``.

    A prefix test is used, not a substring test, so that a foreign URL which
    merely embeds the base URL (for example in a query string such as
    ``http://other/?r=http://base/``) is still counted as external.

    :param base_url: URL of the page the link was found on
    :param url: raw ``href`` value; a falsy value resolves to ``base_url``
        itself and is therefore internal
    :return: ``True`` if the link is external, ``False`` otherwise
    """
    resolved = urljoin(base_url, url).strip()
    return not resolved.startswith(base_url)
|
|
|
|
|
|
def is_od(url):
    """Heuristically decide whether ``url`` is an open directory.

    * URLs without a trailing ``/`` are rejected immediately.
    * ``ftp://`` URLs (when ``config.SUBMIT_FTP`` is enabled) are accepted if
      an anonymous login succeeds.
    * Otherwise, when ``config.SUBMIT_HTTP`` is enabled, the page is fetched
      without following redirects and without TLS verification. It is
      rejected unless the status is 200, and rejected when it has more than
      11 external links, more than 5 ``<link>`` tags or more than 7
      ``<script>`` tags.

    Any exception raised while probing is treated as "not an open directory".

    :param url: candidate URL
    :return: ``True`` if the URL passes the checks, ``False`` otherwise
    """
    if not url.endswith("/"):
        print("Url does not end with trailing /")
        return False

    try:
        if url.startswith("ftp://") and config.SUBMIT_FTP:
            # Same 30s timeout as the HTTP probe so a dead host cannot hang us
            ftp = FTP(urlparse(url).netloc, timeout=30)
            try:
                ftp.login()
            finally:
                # Release the socket even when the anonymous login is refused
                ftp.close()
            return True
        elif config.SUBMIT_HTTP:
            r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
            if r.status_code != 200:
                # Redirects (and any other non-200 answer) are not accepted
                return False
            soup = BeautifulSoup(r.text, "lxml")

            external_links = sum(
                1 for a in soup.find_all("a") if is_external_link(url, a.get("href"))
            )
            link_tags = len(soup.find_all("link"))
            script_tags = len(soup.find_all("script"))

            if external_links > 11:
                # Too many external links
                return False

            if link_tags > 5:
                # Too many link tags
                return False

            if script_tags > 7:
                # Too many script tags
                return False

            return True

    except Exception:
        # Best-effort probe: network, parsing or config errors mean "no"
        return False

    # No submission protocol is enabled for this URL
    return False
|
|
|
|
|
|
def has_parent_dir(url):
    """Tell whether the parent of ``url`` is itself an open directory.

    The parent URL (``url`` joined with ``../``) is fetched without following
    redirects and without TLS verification. The function returns the result
    of ``is_od(parent_url)`` when the parent page answers 200 and contains an
    anchor whose ``href`` ends with ``/`` and resolves exactly to ``url``.
    In every other case, including any error while fetching or parsing, it
    returns ``False``.

    :param url: directory URL to inspect
    :return: ``True`` if a listing parent directory exists, else ``False``
    """
    parsed_url = urlparse(url)

    if parsed_url.path == "/":
        # Already at the root of the host
        return False

    parent_url = urljoin(url, "../")
    try:
        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
        if r.status_code != 200:
            return False
        soup = BeautifulSoup(r.text, "lxml")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if href and href.endswith("/") and urljoin(parent_url, href) == url:
                # The parent page exists, and has a link to the child directory
                return is_od(parent_url)

    except Exception:
        # Catch Exception, not a bare except, so KeyboardInterrupt and
        # SystemExit still propagate
        return False

    # Parent page exists, but does not have a link to the child directory
    return False
|
|
|
|
|
|
def get_top_directory(url):
    """Climb from ``url`` to the highest ancestor accepted by ``has_parent_dir``.

    ``ftp://`` URLs are returned untouched. For any other URL, ``../`` is
    applied repeatedly for as long as ``has_parent_dir`` reports a parent for
    the current URL; the last URL reached is returned.
    """
    current = url

    if not current.startswith("ftp://"):
        while has_parent_dir(current):
            current = urljoin(current, "../")

    return current
|