diff --git a/parser.py b/parser.py index c81ac2f..9fc7ea6 100644 --- a/parser.py +++ b/parser.py @@ -68,12 +68,10 @@ class PageParser: return "d" return "f" - @staticmethod def clean_page(text): text = text.replace("", "") return text @@ -108,11 +106,11 @@ class NginxParser(PageParser): soup = BeautifulSoup(text, "html.parser") - for link in soup.find("pre").find_all("a"): - - parsed_link = self.parse_link(link, text, base_url) - if parsed_link is not None: - links[parsed_link[0]] = parsed_link[1] + for pre in soup.find_all("pre"): + for link in pre.find_all("a"): + parsed_link = self.parse_link(link, text, base_url) + if parsed_link is not None: + links[parsed_link[0]] = parsed_link[1] return links diff --git a/problematic websites b/problematic websites index 9496011..557b961 100644 --- a/problematic websites +++ b/problematic websites @@ -7,7 +7,6 @@ https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apac https://filepursuit.com/ (recursion problem - not an OD) https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded) -http://www.gamers.org/pub/archives/uwp-uml/ (?) @@ -23,4 +22,5 @@ http://www.serenitystreetnews.com/videos/ https://www.datto.com/resource-downloads/ https://www.annmariegianni.com/wp-content/uploads/ http://archive.scene.org/pub/resources/docs/bbs_finland/ -http://dl.apkhome.org \ No newline at end of file +http://dl.apkhome.org +http://www.gamers.org/pub/archives/uwp-uml/ \ No newline at end of file