mirror of
https://github.com/simon987/opendirectories-bot.git
synced 2025-04-19 18:26:44 +00:00
Fixed problem with a problematic type of OD
This commit is contained in:
parent
854fc76cc1
commit
f32f9ee3a6
@ -92,7 +92,8 @@ class Crawler:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
|
c = Crawler("https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/", True)
|
||||||
c.crawl()
|
c.crawl()
|
||||||
|
print(c.files)
|
||||||
c.store_report("000008")
|
c.store_report("000008")
|
||||||
|
|
||||||
|
@ -50,8 +50,10 @@ class PageParser:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def should_save_link(text):
|
def should_save_link(text):
|
||||||
|
|
||||||
return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
|
return text.lower().find("parent directory") == -1 and text != "Name" and text != "Last modified" and \
|
||||||
text != "Size" and text != "Description " and text != "Description" and text != "../"
|
text != "Size" and text != "Description " and text != "Description" and text != "../" and text != "" and\
|
||||||
|
text is not None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def file_type(link):
|
def file_type(link):
|
||||||
@ -170,11 +172,13 @@ class ApacheParser(PageParser):
|
|||||||
if len(row.find_all("th")) > 0:
|
if len(row.find_all("th")) > 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
link = row.find("a")
|
links_in_row = row.find_all("a")
|
||||||
|
|
||||||
|
for link in links_in_row:
|
||||||
if link is None:
|
if link is None:
|
||||||
# Exited directory listing
|
# Exited directory listing
|
||||||
return links
|
return links
|
||||||
|
|
||||||
if PageParser.should_save_link(link.text):
|
if PageParser.should_save_link(link.text):
|
||||||
|
|
||||||
target = link.get("href")
|
target = link.get("href")
|
||||||
|
@ -3,7 +3,6 @@ Breaks:
|
|||||||
http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
|
http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
|
||||||
https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
|
https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
|
||||||
|
|
||||||
https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
|
|
||||||
http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
|
http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
|
||||||
https://filepursuit.com/ (recursion problem - not an OD)
|
https://filepursuit.com/ (recursion problem - not an OD)
|
||||||
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
|
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user