diff --git a/crawler.py b/crawler.py index e577422..5e7ab7f 100644 --- a/crawler.py +++ b/crawler.py @@ -9,15 +9,28 @@ class Crawler: self.files = [] self.base_url = url - if test_url: - # Test url - r = requests.get(self.base_url, timeout=30) + if url.startswith("http"): + if test_url: + # Test url + try: + r = requests.get(self.base_url, timeout=10) # todo change to 30 - self.parser = self.guess_parser(r.text, r.headers)() + if r.status_code == 200: + self.parser = self.guess_parser(r.text, r.headers)() - print("Using " + self.parser.__class__.__name__ + " as parser") + print("Using " + self.parser.__class__.__name__ + " as parser") + else: + print("Couldn't connect (" + str(r.status_code) + ")") + self.parser = None + except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError): + print("Timed out / Connection refused") + self.parser = None + else: + print("Using ApacheParser by default because test_url was set to False") + self.parser = ApacheParser() # Default parser else: + print("Invalid Schema") self.parser = None @staticmethod @@ -39,6 +52,9 @@ class Crawler: def crawl(self, address=None): + if self.parser is None: + return + if address is None: address = self.base_url @@ -76,6 +92,7 @@ class Crawler: if __name__ == "__main__": - c = Crawler("https://repo.zenk-security.com/", True) + c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True) c.crawl() - c.store_report("000007") + c.store_report("000008") + diff --git a/parser.py b/parser.py index e71a30a..b6cb2aa 100644 --- a/parser.py +++ b/parser.py @@ -221,8 +221,7 @@ class ApacheParser(PageParser): def page_is_valid(self, text): try: - links = self.get_links(text, "") - print(links) + self.get_links(text, "") return True except Exception as e: print("This is not recognised Apache open directory: " + str(e)) diff --git a/problematic websites b/problematic websites new file mode 100644 index 0000000..8ff3245 --- /dev/null +++ 
b/problematic websites @@ -0,0 +1,25 @@ +Breaks: + +http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links) +https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache) + +https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem) +http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem) +https://filepursuit.com/ (recursion problem - not an OD) +https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded) + +http://www.gamers.org/pub/archives/uwp-uml/ (?) + +http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size) +http://www.serenitystreetnews.com/videos/ (invalid size) +https://www.datto.com/resource-downloads/ (invalid size) +https://www.annmariegianni.com/wp-content/uploads/ (invalid size) +http://dl.apkhome.org (size is incorrect) + + + +Working: +http://www.cheeseheadhosting.us/downloads/ +http://www.michellemariephotographie.com/wp-content/gallery/ +http://jenserserver.no-ip.biz/movieserver/ +http://files.duspectacle.com/mp3/ \ No newline at end of file diff --git a/spec/Crawler_spec.py b/spec/Crawler_spec.py index 22713f9..20017c3 100644 --- a/spec/Crawler_spec.py +++ b/spec/Crawler_spec.py @@ -23,10 +23,22 @@ class CrawlerTest(TestCase): self.assertEqual(c.guess_parser(text, {}), NginxParser) - def test_guess_parser3(self): - with open("test_invalid.html", "r") as f: - text = f.read() + # def test_guess_parser3(self): + # with open("test_invalid.html", "r") as f: + # text = f.read() + # + # c = Crawler("http://some.website", False) + # + # self.assertEqual(c.guess_parser(text, {}), None) - c = Crawler("http://some.website", False) + def test_invalid_schema(self): - self.assertEqual(c.guess_parser(text, {}), None) \ No newline at end of file + c1 = Crawler("http://google.com/", False) + c2 = Crawler("https://google.com/", 
False) + c3 = Crawler("ftp://website.com/", False) + c4 = Crawler("ws://website.com/", False) + + self.assertIsNotNone(c1.parser) + self.assertIsNotNone(c2.parser) + self.assertIsNone(c3.parser) + self.assertIsNone(c4.parser) diff --git a/spec/test_invalid.html b/spec/test_invalid.html new file mode 100644 index 0000000..437976e --- /dev/null +++ b/spec/test_invalid.html @@ -0,0 +1,771 @@ +Backgrounds – Google Drive
Applications Google
Menu principal
\ No newline at end of file