Fixed parsing of open directories (ODs) whose listing rows contain more than one link or links with empty text

This commit is contained in:
simon 2018-02-06 16:03:18 -05:00
parent 854fc76cc1
commit f32f9ee3a6
3 changed files with 24 additions and 20 deletions

View File

@@ -92,7 +92,8 @@ class Crawler:
if __name__ == "__main__": if __name__ == "__main__":
c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True) c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True)
c.crawl() c.crawl()
print(c.files)
c.store_report("000008") c.store_report("000008")

View File

@@ -50,8 +50,10 @@ class PageParser:
@staticmethod @staticmethod
def should_save_link(text): def should_save_link(text):
return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \ return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
text != "Size" and text != "Description " and text != "Description" and text != "../" text != "Size" and text != "Description " and text != "Description" and text != "../" and text != "" and\
text is not None
@staticmethod @staticmethod
def file_type(link): def file_type(link):
@@ -170,11 +172,13 @@ class ApacheParser(PageParser):
if len(row.find_all("th")) > 0: if len(row.find_all("th")) > 0:
continue continue
link = row.find("a") links_in_row = row.find_all("a")
for link in links_in_row:
if link is None: if link is None:
# Exited directory listing # Exited directory listing
return links return links
if PageParser.should_save_link(link.text): if PageParser.should_save_link(link.text):
target = link.get("href") target = link.get("href")

View File

@@ -3,7 +3,6 @@ Breaks:
http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links) http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache) https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem) http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
https://filepursuit.com/ (recursion problem - not an OD) https://filepursuit.com/ (recursion problem - not an OD)
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded) https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)