from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import re
import humanfriendly


class PageParser:
    def get_links(self, text: str, base_url: str):
        raise NotImplementedError()

    @staticmethod
    def get_parser_type(headers):
        """Get the appropriate parser type for a server based on its Server header"""
        server = headers.get("Server")
        if server is not None:
            if server == "nginx":
                return NginxParser

    @staticmethod
    def should_save_link(text):
        # Skip the parent-directory link and the header/footer labels that
        # directory listings emit as plain anchors or cells
        return text.lower().find("parent directory") == -1 and \
            text not in ("Name", "Last modified", "Size", "Description ", "Description", "../")

    @staticmethod
    def file_type(link):
        # Directory links end with a slash; everything else is a file
        return "d" if link.endswith("/") else "f"

    @staticmethod
    def get_size(cols):
        # Sketch (assumption: this helper is referenced below but not shown in
        # this section): return the first table cell that humanfriendly
        # recognises as a size, e.g. "714M" or "1.2 GiB"
        for col in cols:
            try:
                return humanfriendly.parse_size(col)
            except humanfriendly.InvalidSize:
                continue
        return 0


class NginxParser(PageParser):
    def get_links(self, text, base_url: str):
        links = dict()
        soup = BeautifulSoup(text, "html.parser")

        # Handle weird character formats and tag names
        # (assumption: normalise upper-case anchors so the raw-text search in
        # the fallback branch below still finds them)
        text = text.replace("<A HREF=", "<a href=").replace("</A>", "</a>")

        rows = soup.find_all("tr")
        if len(rows) > 0:
            # Table-based listing (e.g. fancyindex): one row per entry
            for row in rows:
                if len(row.find_all("th")) > 0:
                    # Header row
                    continue
                link = row.find("a")
                if link is None:
                    # Exited directory listing
                    return links
                if PageParser.should_save_link(link.text):
                    target = link.get("href")
                    full_link = urljoin(base_url, target)
                    file_type = PageParser.file_type(full_link)
                    if file_type == "f":
                        extension = os.path.splitext(full_link)[1].strip(".")
                        cols = row.find_all("td")
                        for i in range(len(cols)):
                            cols[i] = cols[i].string if cols[i].string is not None else ""
                        size = self.get_size(cols)
                        links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
                    else:
                        links[link.text] = dict(link=full_link, type=file_type)
        else:
            # Bare autoindex listing: a <pre> block of anchors, with the date
            # and size as plain text on the same line as each anchor
            for link in soup.find_all("a"):
                if PageParser.should_save_link(link.text):
                    target = link.get("href")
                    full_link = urljoin(base_url, target)
                    file_type = PageParser.file_type(full_link)
                    if file_type == "f":
                        extension = os.path.splitext(full_link)[1].strip(".")
                        target_index = text.find('href="' + target)
                        # Reconstruction sketch from here on (the source is cut
                        # off): take the rest of the raw line and parse its last
                        # token as the size, e.g.
                        #   <a href="x.iso">x.iso</a>  19-Feb-2017 12:00  714M
                        line_end = text.find("\n", target_index)
                        line = text[target_index:line_end] if line_end != -1 else text[target_index:]
                        match = re.search(r"(\S+)\s*$", line)
                        try:
                            size = humanfriendly.parse_size(match.group(1)) if match else 0
                        except humanfriendly.InvalidSize:
                            size = 0
                        links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
                    else:
                        links[link.text] = dict(link=full_link, type=file_type)

        return links
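
# ---------------------------------------------------------------------------
# Usage sketch (not part of the module proper). The HTML below is a
# hypothetical nginx autoindex page and example.com is a placeholder; a real
# crawler would take `text` from an HTTP response body and `headers` from the
# response headers before asking get_parser_type which parser to use.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = (
        '<html><body><h1>Index of /files/</h1><hr><pre>'
        '<a href="../">../</a>\n'
        '<a href="debian.iso">debian.iso</a>  19-Feb-2017 12:00  714M\n'
        '<a href="subdir/">subdir/</a>        19-Feb-2017 12:00     -\n'
        '</pre><hr></body></html>'
    )
    parser_type = PageParser.get_parser_type({"Server": "nginx"})
    if parser_type is not None:
        parser = parser_type()
        for name, info in parser.get_links(sample, "http://example.com/files/").items():
            print(name, info)
        # Expected shape: debian.iso -> {'link': ..., 'size': 714000000,
        # 'ext': 'iso', 'type': 'f'} and subdir/ -> {'link': ..., 'type': 'd'}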