mirror of
https://github.com/simon987/opendirectories-bot.git
synced 2025-04-19 02:06:45 +00:00
Fixes problem with link tags with no href
This commit is contained in:
parent
56b28f534e
commit
90c5835cb5
79
parser.py
79
parser.py
@ -136,23 +136,24 @@ class NginxParser(PageParser):
|
|||||||
try:
|
try:
|
||||||
if PageParser.should_save_link(link.text):
|
if PageParser.should_save_link(link.text):
|
||||||
target = link.get("href")
|
target = link.get("href")
|
||||||
short_file_name = os.path.split(target)[1]
|
if target is not None:
|
||||||
full_link = urljoin(base_url, target)
|
short_file_name = os.path.split(target)[1]
|
||||||
file_type = PageParser.file_type(target)
|
full_link = urljoin(base_url, target)
|
||||||
|
file_type = PageParser.file_type(target)
|
||||||
|
|
||||||
if file_type == "f":
|
if file_type == "f":
|
||||||
extension = os.path.splitext(full_link)[1].strip(".")
|
extension = os.path.splitext(full_link)[1].strip(".")
|
||||||
|
|
||||||
# Parse size
|
# Parse size
|
||||||
target_index = text.find("</a", text.find(target))
|
target_index = text.find("</a", text.find(target))
|
||||||
date_and_size = text[target_index:text.find("<a", target_index)]
|
date_and_size = text[target_index:text.find("<a", target_index)]
|
||||||
|
|
||||||
cols = re.split("\s+", date_and_size)
|
cols = re.split("\s+", date_and_size)
|
||||||
size = self.get_size(cols[1:], short_file_name)
|
size = self.get_size(cols[1:], short_file_name)
|
||||||
|
|
||||||
return target, dict(link=full_link, size=size, ext=extension, type=file_type)
|
return target, dict(link=full_link, size=size, ext=extension, type=file_type)
|
||||||
else:
|
else:
|
||||||
return target, dict(link=full_link, type=file_type)
|
return target, dict(link=full_link, type=file_type)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Couldn't parse link " + link.get("href") + str(e))
|
print("Couldn't parse link " + link.get("href") + str(e))
|
||||||
raise e
|
raise e
|
||||||
@ -188,21 +189,22 @@ class ApacheParser(PageParser):
|
|||||||
if PageParser.should_save_link(link.text):
|
if PageParser.should_save_link(link.text):
|
||||||
|
|
||||||
target = link.get("href")
|
target = link.get("href")
|
||||||
short_file_name = os.path.split(target)[1]
|
if target is not None:
|
||||||
file_type = PageParser.file_type(target)
|
short_file_name = os.path.split(target)[1]
|
||||||
full_link = urljoin(base_url, target)
|
file_type = PageParser.file_type(target)
|
||||||
|
full_link = urljoin(base_url, target)
|
||||||
|
|
||||||
if file_type == "f":
|
if file_type == "f":
|
||||||
extension = os.path.splitext(full_link)[1].strip(".")
|
extension = os.path.splitext(full_link)[1].strip(".")
|
||||||
|
|
||||||
cols = row.find_all("td")
|
cols = row.find_all("td")
|
||||||
for i in range(len(cols)):
|
for i in range(len(cols)):
|
||||||
cols[i] = cols[i].string if cols[i].string is not None else "-"
|
cols[i] = cols[i].string if cols[i].string is not None else "-"
|
||||||
size = self.get_size(cols[1:], short_file_name)
|
size = self.get_size(cols[1:], short_file_name)
|
||||||
|
|
||||||
links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
|
links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
|
||||||
else:
|
else:
|
||||||
links[target] = dict(link=full_link, type=file_type)
|
links[target] = dict(link=full_link, type=file_type)
|
||||||
else:
|
else:
|
||||||
|
|
||||||
for link in soup.find_all("a"):
|
for link in soup.find_all("a"):
|
||||||
@ -210,23 +212,24 @@ class ApacheParser(PageParser):
|
|||||||
if PageParser.should_save_link(link.text):
|
if PageParser.should_save_link(link.text):
|
||||||
|
|
||||||
target = link.get("href")
|
target = link.get("href")
|
||||||
short_file_name = os.path.split(target)[1]
|
if target is not None:
|
||||||
full_link = urljoin(base_url, target)
|
short_file_name = os.path.split(target)[1]
|
||||||
file_type = PageParser.file_type(target)
|
full_link = urljoin(base_url, target)
|
||||||
|
file_type = PageParser.file_type(target)
|
||||||
|
|
||||||
if file_type == "f":
|
if file_type == "f":
|
||||||
extension = os.path.splitext(full_link)[1].strip(".")
|
extension = os.path.splitext(full_link)[1].strip(".")
|
||||||
|
|
||||||
target_index = text.find("</a", text.find(target))
|
target_index = text.find("</a", text.find(target))
|
||||||
date_and_size = text[target_index:text.find("<a", target_index)] # in some cases we're looking for </pre instead
|
date_and_size = text[target_index:text.find("<a", target_index)] # in some cases we're looking for </pre instead
|
||||||
date_and_size = text[target_index:text.find("</pre", target_index)] if text.find("<a", target_index) == -1 else date_and_size
|
date_and_size = text[target_index:text.find("</pre", target_index)] if text.find("<a", target_index) == -1 else date_and_size
|
||||||
|
|
||||||
cols = re.split("\s+", date_and_size)
|
cols = re.split("\s+", date_and_size)
|
||||||
size = self.get_size(cols[1:], short_file_name)
|
size = self.get_size(cols[1:], short_file_name)
|
||||||
|
|
||||||
links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
|
links[target] = dict(link=full_link, size=size, ext=extension, type=file_type)
|
||||||
else:
|
else:
|
||||||
links[target] = dict(link=full_link, type=file_type)
|
links[target] = dict(link=full_link, type=file_type)
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user