Mirror of https://github.com/simon987/opendirectories-bot.git, synced 2025-04-19 18:26:44 +00:00
Fixed problem with non-http websites

commit 854fc76cc1
parent 23775ec126
crawler.py (25 lines changed)
@@ -9,15 +9,28 @@ class Crawler:
         self.files = []
         self.base_url = url
 
-        if test_url:
-            # Test url
-            r = requests.get(self.base_url, timeout=30)
-
-            if r.status_code == 200:
-                self.parser = self.guess_parser(r.text, r.headers)()
-                print("Using " + self.parser.__class__.__name__ + " as parser")
-
-            else:
-                self.parser = None
+        if url.startswith("http"):
+
+            if test_url:
+                # Test url
+                try:
+                    r = requests.get(self.base_url, timeout=10)  # todo change to 30
+
+                    if r.status_code == 200:
+                        self.parser = self.guess_parser(r.text, r.headers)()
+                        print("Using " + self.parser.__class__.__name__ + " as parser")
+                    else:
+                        print("Couldn't connect (" + str(r.status_code) + ")")
+                        self.parser = None
+
+                except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
+                    print("Timed out / Connection refused")
+                    self.parser = None
+            else:
+                print("Using ApacheParser by default because test_url was set to False")
+                self.parser = ApacheParser()  # Default parser
+        else:
+            print("Invalid Schema")
+            self.parser = None
 
     @staticmethod
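A note on the new guard, with a minimal sketch (illustrative only, not part of the commit): url.startswith("http") accepts both http:// and https://, but it would also accept any other scheme that merely begins with "http". A stricter variant could compare the parsed scheme instead. Separately, since requests' ConnectTimeout is a subclass of ConnectionError, the except tuple would catch connect timeouts even without listing ConnectTimeout explicitly; keeping it is harmless.

    # Illustrative sketch, standard library only; function name is invented here.
    from urllib.parse import urlparse

    def has_supported_scheme(url):
        # Rejects ftp:// and ws:// like the commit does, but also rejects
        # hypothetical schemes such as "httpx://" that a plain
        # url.startswith("http") check would let through.
        return urlparse(url).scheme in ("http", "https")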
@@ -39,6 +52,9 @@ class Crawler:
 
     def crawl(self, address=None):
 
+        if self.parser is None:
+            return
+
         if address is None:
             address = self.base_url
 
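The early return added to crawl() above makes a failed initialization a no-op instead of an error: without it, crawl() would go on to use self.parser while it is None. A hypothetical call pattern it protects (URL invented for illustration):

    c = Crawler("ws://unsupported.example/", True)  # scheme check fails, parser stays None
    c.crawl()  # now returns immediately instead of failing on the None parser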
@@ -76,6 +92,7 @@ class Crawler:
 
 
 if __name__ == "__main__":
-    c = Crawler("https://repo.zenk-security.com/", True)
+    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
     c.crawl()
-    c.store_report("000007")
+    c.store_report("000008")
+
@@ -221,8 +221,7 @@ class ApacheParser(PageParser):
     def page_is_valid(self, text):
 
         try:
-            links = self.get_links(text, "")
-            print(links)
+            self.get_links(text, "")
             return True
         except Exception as e:
             print("This is not recognised Apache open directory: " + str(e))
problematic websites (new file, 25 lines)

@@ -0,0 +1,25 @@
+Breaks:
+
+http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
+https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
+
+https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
+http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
+https://filepursuit.com/ (recursion problem - not an OD)
+https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
+
+http://www.gamers.org/pub/archives/uwp-uml/ (?)
+
+http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size)
+http://www.serenitystreetnews.com/videos/ (invalid size)
+https://www.datto.com/resource-downloads/ (invalid size)
+https://www.annmariegianni.com/wp-content/uploads/ (invalid size)
+http://dl.apkhome.org (size is incorrect)
+
+
+
+Working:
+http://www.cheeseheadhosting.us/downloads/
+http://www.michellemariephotographie.com/wp-content/gallery/
+http://jenserserver.no-ip.biz/movieserver/
+http://files.duspectacle.com/mp3/
@@ -23,10 +23,22 @@ class CrawlerTest(TestCase):
 
         self.assertEqual(c.guess_parser(text, {}), NginxParser)
 
-    def test_guess_parser3(self):
-        with open("test_invalid.html", "r") as f:
-            text = f.read()
-
-        c = Crawler("http://some.website", False)
-
-        self.assertEqual(c.guess_parser(text, {}), None)
+    # def test_guess_parser3(self):
+    #     with open("test_invalid.html", "r") as f:
+    #         text = f.read()
+    #
+    #     c = Crawler("http://some.website", False)
+    #
+    #     self.assertEqual(c.guess_parser(text, {}), None)
+
+    def test_invalid_schema(self):
+
+        c1 = Crawler("http://google.com/", False)
+        c2 = Crawler("https://google.com/", False)
+        c3 = Crawler("ftp://website.com/", False)
+        c4 = Crawler("ws://website.com/", False)
+
+        self.assertIsNotNone(c1.parser)
+        self.assertIsNotNone(c2.parser)
+        self.assertIsNone(c3.parser)
+        self.assertIsNone(c4.parser)
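What the new test asserts can also be reproduced outside the test runner. A sketch, assuming Crawler is importable from crawler.py; with test_url=False no network request is made, so the outcome depends only on the scheme check:

    from crawler import Crawler

    assert Crawler("https://google.com/", False).parser is not None  # default ApacheParser branch
    assert Crawler("ftp://website.com/", False).parser is None       # "Invalid Schema" branch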
spec/test_invalid.html (new file, 771 lines)

File diff suppressed because one or more lines are too long