mirror of https://github.com/simon987/opendirectories-bot.git
synced 2025-04-10 14:06:44 +00:00
Fixed problem with non-http websites
This commit is contained in:
parent 23775ec126
commit 854fc76cc1
crawler.py | 31
@@ -9,15 +9,28 @@ class Crawler:
         self.files = []
         self.base_url = url

-        if test_url:
-            # Test url
-            r = requests.get(self.base_url, timeout=30)
-
-            self.parser = self.guess_parser(r.text, r.headers)()
-
-            print("Using " + self.parser.__class__.__name__ + " as parser")
+        if url.startswith("http"):
+            if test_url:
+                # Test url
+                try:
+                    r = requests.get(self.base_url, timeout=10)  # todo change to 30
+
+                    if r.status_code == 200:
+                        self.parser = self.guess_parser(r.text, r.headers)()
+
+                        print("Using " + self.parser.__class__.__name__ + " as parser")
+                    else:
+                        print("Couldn't connect (" + str(r.status_code) + ")")
+                        self.parser = None
+
+                except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
+                    print("Timed out / Connection refused")
+                    self.parser = None
+            else:
+                print("Using ApacheParser by default because test_url was set to False")
+                self.parser = ApacheParser()  # Default parser
+        else:
+            print("Invalid Schema")
+            self.parser = None

     @staticmethod
@@ -39,6 +52,9 @@ class Crawler:

     def crawl(self, address=None):

+        if self.parser is None:
+            return
+
         if address is None:
             address = self.base_url
@@ -76,6 +92,7 @@ class Crawler:


 if __name__ == "__main__":
-    c = Crawler("https://repo.zenk-security.com/", True)
+    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
     c.crawl()
-    c.store_report("000007")
+    c.store_report("000008")
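A quick way to see the fix: with test_url set to False no request is made, so the schema check alone decides whether a parser is assigned. A minimal sketch of the new behaviour (the import and print usage are assumptions for illustration, not part of the commit):

    # sketch: exercising the new schema handling in crawler.py
    from crawler import Crawler

    # http/https URLs still get a parser (ApacheParser by default when test_url=False)
    c_ok = Crawler("http://some.website/", False)
    print(c_ok.parser)   # -> ApacheParser instance

    # any other schema is now rejected up front instead of crashing inside requests
    c_bad = Crawler("ftp://website.com/", False)
    print(c_bad.parser)  # -> None, so crawl() returns immediately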
@@ -221,8 +221,7 @@ class ApacheParser(PageParser):
     def page_is_valid(self, text):

         try:
-            links = self.get_links(text, "")
-            print(links)
+            self.get_links(text, "")
             return True
         except Exception as e:
             print("This is not recognised Apache open directory: " + str(e))
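For context, the method only probes whether the Apache link extractor raises; the extracted links themselves are discarded, which is why the assignment and print were dropped. The whole method presumably reads as follows after this change (the final return False sits outside the hunk, so it is an assumption):

    def page_is_valid(self, text):

        try:
            # probe only: a page that get_links can't parse is not an
            # Apache open directory; the extracted links are unused here
            self.get_links(text, "")
            return True
        except Exception as e:
            print("This is not recognised Apache open directory: " + str(e))
            return False  # assumed: not visible in the hunk above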
problematic websites | 25 (new file)

@@ -0,0 +1,25 @@
+Breaks:
+
+http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
+https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
+
+https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
+http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
+https://filepursuit.com/ (recursion problem - not an OD)
+https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
+
+http://www.gamers.org/pub/archives/uwp-uml/ (?)
+
+http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size)
+http://www.serenitystreetnews.com/videos/ (invalid size)
+https://www.datto.com/resource-downloads/ (invalid size)
+https://www.annmariegianni.com/wp-content/uploads/ (invalid size)
+http://dl.apkhome.org (size is incorrect)
+
+
+
+Working:
+http://www.cheeseheadhosting.us/downloads/
+http://www.michellemariephotographie.com/wp-content/gallery/
+http://jenserserver.no-ip.biz/movieserver/
+http://files.duspectacle.com/mp3/
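Several of the "Breaks" entries are tagged (recursion problem), which this commit does not address. One conventional guard is a depth cutoff in Crawler.crawl(); a hypothetical sketch, where depth, max_depth, and the link-walking loop are assumptions rather than code from this repo:

    def crawl(self, address=None, depth=0, max_depth=20):

        # bail out early, as the real method now does...
        if self.parser is None:
            return
        # ...and additionally refuse to recurse past a fixed depth
        if depth > max_depth:
            return

        if address is None:
            address = self.base_url

        r = requests.get(address, timeout=10)
        for link in self.parser.get_links(r.text, address):
            if link.endswith("/"):
                self.crawl(link, depth + 1, max_depth)  # subdirectory: bounded recursion
            else:
                self.files.append(link)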
@@ -23,10 +23,22 @@ class CrawlerTest(TestCase):

         self.assertEqual(c.guess_parser(text, {}), NginxParser)

-    def test_guess_parser3(self):
-        with open("test_invalid.html", "r") as f:
-            text = f.read()
-
-        c = Crawler("http://some.website", False)
-
-        self.assertEqual(c.guess_parser(text, {}), None)
+    # def test_guess_parser3(self):
+    #     with open("test_invalid.html", "r") as f:
+    #         text = f.read()
+    #
+    #     c = Crawler("http://some.website", False)
+    #
+    #     self.assertEqual(c.guess_parser(text, {}), None)
+
+    def test_invalid_schema(self):
+
+        c1 = Crawler("http://google.com/", False)
+        c2 = Crawler("https://google.com/", False)
+        c3 = Crawler("ftp://website.com/", False)
+        c4 = Crawler("ws://website.com/", False)
+
+        self.assertIsNotNone(c1.parser)
+        self.assertIsNotNone(c2.parser)
+        self.assertIsNone(c3.parser)
+        self.assertIsNone(c4.parser)
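Because every Crawler in test_invalid_schema is built with test_url=False, the new test needs no network access. A sketch of running the suite offline (the "spec" discovery path is assumed from the file paths in this commit):

    import unittest

    # discover and run the specs; directory name assumed from spec/test_invalid.html below
    suite = unittest.TestLoader().discover("spec")
    unittest.TextTestRunner(verbosity=2).run(suite)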
spec/test_invalid.html | 771 (new file)

File diff suppressed because one or more lines are too long