Mirror of https://github.com/simon987/opendirectories-bot.git (synced 2025-12-14 07:39:04 +00:00)

Commit: Fixed bugs, enhanced parser

parser.py — 182 lines changed
@@ -1,14 +1,43 @@
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
 import os
 import re
+from urllib.parse import urljoin
 
 import humanfriendly
+from bs4 import BeautifulSoup
 
 
 class PageParser:
 
+    def __init__(self):
+        self.col_start = None
+        self.col_end = None
+        self.size_unknown = True
+
     def get_links(self, text: str, base_url: str):
         raise NotImplementedError()
 
     @staticmethod
+    def get_size_columns(cols):
+
+        for i in range(len(cols)):
+
+            if i == len(cols) - 1:
+                try:
+                    humanfriendly.parse_size(cols[i])
+                    return tuple([i, i])
+                except humanfriendly.InvalidSize:
+                    return None
+
+            try:
+                humanfriendly.parse_size(cols[i] + cols[i + 1])
+                return tuple([i, i + 1])
+            except humanfriendly.InvalidSize:
+                try:
+                    humanfriendly.parse_size(cols[i])
+                    return tuple([i, i])
+                except humanfriendly.InvalidSize:
+                    continue
+
+    @staticmethod
     def get_parser_type(headers):
"""Get appropriate parser type for a a server based on its header"""
|
||||
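
For context, a minimal sketch (not part of the commit) of what the new get_size_columns is expected to return; the humanfriendly package does the actual size parsing, and the sample rows below are invented:

import humanfriendly

# Directory listings sometimes split a size across two columns
# ("4.5", "KB") and sometimes keep it in one ("4.5KB"); parse_size
# accepts both spellings:
print(humanfriendly.parse_size("4.5KB"))  # 4500 (decimal units by default)

# get_size_columns probes adjacent column pairs first, then single
# columns, so both layouts resolve to a (start, end) index tuple:
#   ["10-Feb-2018", "14:23", "4.5", "KB"]  ->  (2, 3)
#   ["10-Feb-2018", "14:23", "4.5KB"]      ->  (2, 2)
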
@@ -26,26 +55,82 @@ class PageParser:
 
     @staticmethod
     def file_type(link):
-        return "d" if link.endswith("/") else "f"
+
+        if link.endswith("/") or link.startswith("?"):
+            return "d"
+        return "f"
+
+    @staticmethod
+    def clean_page(text):
+        text = text.replace("<A", "<a")
+        text = text.replace("</A", "</a")
+        # text = text.replace("&amp;", "&")
+        text = text.replace("<hr>", "")
+
+        return text
+
+    def get_size(self, cols):
+
+        # Figure out which column(s) is the size one
+        size_cols = self.get_size_columns(cols)
+        if size_cols is not None:
+            col_start, col_end = size_cols
+            self.size_unknown = False
+
+            size_human = cols[col_start] if col_start == col_end else cols[col_start] + cols[col_end]
+
+            try:
+                size = humanfriendly.parse_size(size_human)
+            except humanfriendly.InvalidSize:
+                size = 0
+        else:
+            size = 0
+
+        return size
 
 
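For illustration, a standalone copy (not from the repo) of the new file_type rule; the query-string case is what changed, and the asserts are invented examples:

def file_type(link):
    # Links ending in "/" are directories; "?..." query-only links
    # (e.g. script-driven listings and sort links) are treated as
    # directories as well rather than as downloadable files.
    if link.endswith("/") or link.startswith("?"):
        return "d"
    return "f"

assert file_type("movies/") == "d"
assert file_type("?C=M;O=A") == "d"
assert file_type("movie.mkv") == "f"
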
 class NginxParser(PageParser):
     def get_links(self, text, base_url: str):
 
         links = dict()
-        soup = BeautifulSoup(text, "html.parser")
 
         # Handle weird character formats and tag names
-        text = text.replace("<A", "<a")
-        text = text.replace("</A", "</a")
-        text = text.replace("&amp;", "&")
+        text = self.clean_page(text)
 
+        soup = BeautifulSoup(text, "html.parser")
 
         for link in soup.find("pre").find_all("a"):
 
             if link.text != "../":
+                parsed_link = self.parse_link(link, text, base_url)
+                if parsed_link is not None:
+                    links[parsed_link[0]] = parsed_link[1]
 
         return links
 
+    def page_is_valid(self, text):
+        # Handle weird character formats and tag names
+        text = self.clean_page(text)
+
+        soup = BeautifulSoup(text, "html.parser")
+
+        if soup.find("pre") is None:
+            return False
+
+        # try to parse a single link
+        for link in soup.find("pre").find_all("a"):
+            if PageParser.should_save_link(link.text):
+                if self.parse_link(link, text, "") is None:
+                    return False
+
+        return True
+
+    def parse_link(self, link, text, base_url):
 
         try:
             if PageParser.should_save_link(link.text):
                 target = link.get("href")
                 full_link = urljoin(base_url, target)
-                file_type = PageParser.file_type(full_link)
+                file_type = PageParser.file_type(target)
 
                 if file_type == "f":
                     extension = os.path.splitext(full_link)[1].strip(".")
@@ -53,46 +138,30 @@ class NginxParser(PageParser):
 
                     # Parse size
                     target_index = text.find("</a", text.find(target))
                     date_and_size = text[target_index:text.find("<a", target_index)]
-                    size = humanfriendly.parse_size(re.split("\s+", date_and_size)[3])
-
-                    links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
+                    cols = re.split("\s+", date_and_size)
+                    size = self.get_size(cols)
+
+                    return target, dict(link=full_link, size=size, ext=extension, type=file_type)
                 else:
-                    links[link.text] = dict(link=full_link, type=file_type)
+                    return target, dict(link=full_link, type=file_type)
         except Exception as e:
             print("Couldn't parse link " + link.get("href") + str(e))
-            raise e
 
-        return links
+        return None
 
 
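A hypothetical usage sketch of the refactored NginxParser (not from the repo; it assumes parser.py from this repo is importable and that should_save_link accepts ordinary file links; the sample autoindex HTML and URL are invented):

from parser import NginxParser

SAMPLE = """<html><body><h1>Index of /files/</h1><hr><pre>
<a href="../">../</a>
<a href="movie.mkv">movie.mkv</a>            10-Feb-2018 14:23   700M
</pre><hr></body></html>"""

nginx = NginxParser()
if nginx.page_is_valid(SAMPLE):
    links = nginx.get_links(SAMPLE, "http://example.com/files/")
    # parse_link returns (target, info) tuples that get_links collects
    # into a dict, roughly:
    # {"movie.mkv": {"link": "http://example.com/files/movie.mkv",
    #                "size": 700000000, "ext": "mkv", "type": "f"}}
    print(links)
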
 class ApacheParser(PageParser):
 
-    def __init__(self):
-        self.col_start = None
-        self.col_end = None
-        self.size_unknown = True
-
-    def get_size_columns(self, cols):
-
-        for i in range(len(cols) - 1):
-            try:
-                humanfriendly.parse_size(cols[i] + cols[i + 1])
-                return tuple([i, i + 1])
-            except humanfriendly.InvalidSize:
-                try:
-                    humanfriendly.parse_size(cols[i])
-                    return tuple([i, i])
-                except humanfriendly.InvalidSize:
-                    continue
-
     def get_links(self, text, base_url: str):
 
         links = dict()
-        soup = BeautifulSoup(text, "html.parser")
 
         # Handle weird character formats and tag names
-        text = text.replace("<A", "<a")
-        text = text.replace("</A", "</a")
-        text = text.replace("&amp;", "&")
 
+        text = self.clean_page(text)
 
+        soup = BeautifulSoup(text, "html.parser")
 
         if soup.find("table"):
 
@@ -109,20 +178,20 @@ class ApacheParser(PageParser):
                 if PageParser.should_save_link(link.text):
 
                     target = link.get("href")
+                    file_type = PageParser.file_type(target)
                     full_link = urljoin(base_url, target)
-                    file_type = PageParser.file_type(full_link)
 
                     if file_type == "f":
                         extension = os.path.splitext(full_link)[1].strip(".")
 
                         cols = row.find_all("td")
                         for i in range(len(cols)):
-                            cols[i] = cols[i].string if cols[i].string is not None else ""
+                            cols[i] = cols[i].string if cols[i].string is not None else "-"
                         size = self.get_size(cols)
 
-                        links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
+                        links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
                     else:
-                        links[link.text] = dict(link=full_link, type=file_type)
+                        links[target] = dict(link=full_link, type=file_type)
         else:
 
             for link in soup.find_all("a"):
@@ -131,36 +200,33 @@ class ApacheParser(PageParser):
 
                 target = link.get("href")
                 full_link = urljoin(base_url, target)
-                file_type = PageParser.file_type(full_link)
+                file_type = PageParser.file_type(target)
 
                 if file_type == "f":
                     extension = os.path.splitext(full_link)[1].strip(".")
 
                     target_index = text.find("</a", text.find(target))
-                    date_and_size = text[target_index:text.find("<a", target_index)]
+                    date_and_size = text[target_index:text.find("<a", target_index)]  # in some cases we're looking for </pre instead
+                    date_and_size = text[target_index:text.find("</pre", target_index)] if text.find("<a", target_index) == -1 else date_and_size
 
                     cols = re.split("\s+", date_and_size)
                     size = self.get_size(cols)
 
-                    links[link.text] = dict(link=full_link, size=size, ext=extension, type=file_type)
+                    links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
                 else:
-                    links[link.text] = dict(link=full_link, type=file_type)
+                    links[target] = dict(link=full_link, type=file_type)
 
         return links
 
-    def get_size(self, cols):
-        if self.col_start is None:
-            # Figure out which column(s) is the size one
-            size_cols = self.get_size_columns(cols)
-            if size_cols is not None:
-                self.col_start, self.col_end = size_cols
-                self.size_unknown = False
+    def page_is_valid(self, text):
+
+        try:
+            links = self.get_links(text, "")
+            print(links)
+            return True
+        except Exception as e:
+            print("This is not a recognised Apache open directory: " + str(e))
 
-        if self.size_unknown:
-            size = 0
-        else:
-            size_human = cols[self.col_start] if self.col_start == self.col_end else cols[self.col_start] + cols[self.col_end]
-            size = humanfriendly.parse_size(size_human)
-        return size
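
And a similar hypothetical smoke test (again, not from the repo) for the non-table branch of ApacheParser.get_links, which exercises the new </pre fallback used when the entry is the last link on the page; the sample HTML and URL are invented, and it assumes should_save_link accepts plain file links:

from parser import ApacheParser

SAMPLE = """<html><body><h1>Index of /pub</h1><pre>Name                    Last modified       Size
<hr><a href="backup.zip">backup.zip</a>      10-Feb-2018 14:23   15M
</pre></body></html>"""

apache = ApacheParser()
if apache.page_is_valid(SAMPLE):
    # Expected shape, keyed by target since this commit:
    # {"backup.zip": {"link": "http://example.com/pub/backup.zip",
    #                 "size": 15000000, "ext": "zip", "type": "f"}}
    print(apache.get_links(SAMPLE, "http://example.com/pub/"))
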