Fixed problem with non-http websites

This commit is contained in:
simon 2018-02-06 15:50:47 -05:00
parent 23775ec126
commit 854fc76cc1
5 changed files with 838 additions and 14 deletions

View File

@ -9,15 +9,28 @@ class Crawler:
self.files = []
self.base_url = url
if test_url:
# Test url
r = requests.get(self.base_url, timeout=30)
if url.startswith("http"):
if test_url:
# Test url
try:
r = requests.get(self.base_url, timeout=10) # todo change to 30
self.parser = self.guess_parser(r.text, r.headers)()
if r.status_code == 200:
self.parser = self.guess_parser(r.text, r.headers)()
print("Using " + self.parser.__class__.__name__ + " as parser")
print("Using " + self.parser.__class__.__name__ + " as parser")
else:
print("Couldn't connect (" + str(r.status_code) + ")")
self.parser = None
except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
print("Timed out / Connection refused")
self.parser = None
else:
print("Using ApacheParser by default because test_url was set to False")
self.parser = ApacheParser() # Default parser
else:
print("Invalid Schema")
self.parser = None
@staticmethod
@ -39,6 +52,9 @@ class Crawler:
def crawl(self, address=None):
if self.parser is None:
return
if address is None:
address = self.base_url
@ -76,6 +92,7 @@ class Crawler:
if __name__ == "__main__":
c = Crawler("https://repo.zenk-security.com/", True)
c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
c.crawl()
c.store_report("000007")
c.store_report("000008")

View File

@ -221,8 +221,7 @@ class ApacheParser(PageParser):
def page_is_valid(self, text):
try:
links = self.get_links(text, "")
print(links)
self.get_links(text, "")
return True
except Exception as e:
print("This is not recognised Apache open directory: " + str(e))

25
problematic websites Normal file
View File

@ -0,0 +1,25 @@
Breaks:
http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
https://filepursuit.com/ (recursion problem - not an OD)
https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
http://www.gamers.org/pub/archives/uwp-uml/ (?)
http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size)
http://www.serenitystreetnews.com/videos/ (invalid size)
https://www.datto.com/resource-downloads/ (invalid size)
https://www.annmariegianni.com/wp-content/uploads/(invalid size)
http://dl.apkhome.org (size is incorrect)
Working:
http://www.cheeseheadhosting.us/downloads/
http://www.michellemariephotographie.com/wp-content/gallery/
http://jenserserver.no-ip.biz/movieserver/
http://files.duspectacle.com/mp3/

View File

@ -23,10 +23,22 @@ class CrawlerTest(TestCase):
self.assertEqual(c.guess_parser(text, {}), NginxParser)
def test_guess_parser3(self):
with open("test_invalid.html", "r") as f:
text = f.read()
# def test_guess_parser3(self):
# with open("test_invalid.html", "r") as f:
# text = f.read()
#
# c = Crawler("http://some.website", False)
#
# self.assertEqual(c.guess_parser(text, {}), None)
c = Crawler("http://some.website", False)
def test_invalid_schema(self):
self.assertEqual(c.guess_parser(text, {}), None)
c1 = Crawler("http://google.com/", False)
c2 = Crawler("https://google.com/", False)
c3 = Crawler("ftp://website.com/", False)
c4 = Crawler("ws://website.com/", False)
self.assertIsNotNone(c1.parser)
self.assertIsNotNone(c2.parser)
self.assertIsNone(c3.parser)
self.assertIsNone(c4.parser)

771
spec/test_invalid.html Normal file

File diff suppressed because one or more lines are too long