Fixed a problematic website

This commit is contained in:
simon 2018-02-06 17:15:50 -05:00
parent 88cebae047
commit 8e1f4543fd
2 changed files with 7 additions and 9 deletions

View File

@ -68,12 +68,10 @@ class PageParser:
return "d"
return "f"
@staticmethod
def clean_page(text):
    """Normalize raw page markup before it is parsed.

    Every ``<A`` / ``</A`` sequence is rewritten with a lower-case ``a``
    and every literal ``<hr>`` is removed. The substitutions are plain,
    case-sensitive string replacements applied in the order listed below.

    :param text: page markup as a string
    :return: the cleaned markup
    """
    # NOTE(review): a '&' -> '&amp;' substitution used to sit between the
    # anchor fixes and the <hr> removal and was disabled; the reason is not
    # visible here -- confirm before re-enabling it.
    substitutions = (
        ("<A", "<a"),
        ("</A", "</a"),
        ("<hr>", ""),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text
@ -108,11 +106,11 @@ class NginxParser(PageParser):
soup = BeautifulSoup(text, "html.parser")
for link in soup.find("pre").find_all("a"):
parsed_link = self.parse_link(link, text, base_url)
if parsed_link is not None:
links[parsed_link[0]] = parsed_link[1]
for pre in soup.find_all("pre"):
for link in pre.find_all("a"):
parsed_link = self.parse_link(link, text, base_url)
if parsed_link is not None:
links[parsed_link[0]] = parsed_link[1]
return links

View File

@ -7,7 +7,6 @@ https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apac
https://filepursuit.com/ (recursion problem - not an OD)
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
http://www.gamers.org/pub/archives/uwp-uml/ (?)
@ -23,4 +22,5 @@ http://www.serenitystreetnews.com/videos/
https://www.datto.com/resource-downloads/
https://www.annmariegianni.com/wp-content/uploads/
http://archive.scene.org/pub/resources/docs/bbs_finland/
http://dl.apkhome.org
http://dl.apkhome.org
http://www.gamers.org/pub/archives/uwp-uml/