Mirror of https://github.com/simon987/opendirectories-bot.git, synced 2025-04-19 18:26:44 +00:00
Fixed problem with non-http websites

commit 854fc76cc1
parent 23775ec126
crawler.py (25 lines changed)
@@ -9,15 +9,28 @@ class Crawler:
         self.files = []
         self.base_url = url
 
-        if test_url:
-            # Test url
-            r = requests.get(self.base_url, timeout=30)
-
-            if r.status_code == 200:
-                self.parser = self.guess_parser(r.text, r.headers)()
-                print("Using " + self.parser.__class__.__name__ + " as parser")
-
-            else:
-                self.parser = None
+        if url.startswith("http"):
+
+            if test_url:
+                # Test url
+                try:
+                    r = requests.get(self.base_url, timeout=10)  # todo change to 30
+
+                    if r.status_code == 200:
+                        self.parser = self.guess_parser(r.text, r.headers)()
+                        print("Using " + self.parser.__class__.__name__ + " as parser")
+                    else:
+                        print("Couldn't connect (" + str(r.status_code) + ")")
+                        self.parser = None
+
+                except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
+                    print("Timed out / Connection refused")
+                    self.parser = None
+            else:
+                print("Using ApacheParser by default because test_url was set to False")
+                self.parser = ApacheParser()  # Default parser
+        else:
+            print("Invalid Schema")
+            self.parser = None
 
     @staticmethod
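A note on the new guard, with a minimal sketch (illustrative only, not part of the commit): url.startswith("http") accepts both http:// and https://, but it would also accept any other scheme that merely begins with "http". A stricter variant could compare the parsed scheme instead. Separately, since requests' ConnectTimeout is a subclass of ConnectionError, the except tuple would catch connect timeouts even without listing ConnectTimeout explicitly; keeping it is harmless.

    # Illustrative sketch, standard library only; function name is invented here.
    from urllib.parse import urlparse

    def has_supported_scheme(url):
        # Rejects ftp:// and ws:// like the commit does, but also rejects
        # hypothetical schemes such as "httpx://" that a plain
        # url.startswith("http") check would let through.
        return urlparse(url).scheme in ("http", "https")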
@@ -39,6 +52,9 @@ class Crawler:
 
     def crawl(self, address=None):
 
+        if self.parser is None:
+            return
+
         if address is None:
             address = self.base_url
 
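The early return added to crawl() above makes a failed initialization a no-op instead of an error: without it, crawl() would go on to use self.parser while it is None. A hypothetical call pattern it protects (URL invented for illustration):

    c = Crawler("ws://unsupported.example/", True)  # scheme check fails, parser stays None
    c.crawl()  # now returns immediately instead of failing on the None parser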
@@ -76,6 +92,7 @@ class Crawler:
 
 
 if __name__ == "__main__":
-    c = Crawler("https://repo.zenk-security.com/", True)
+    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
     c.crawl()
-    c.store_report("000007")
+    c.store_report("000008")
+
@@ -221,8 +221,7 @@ class ApacheParser(PageParser):
     def page_is_valid(self, text):
 
         try:
-            links = self.get_links(text, "")
-            print(links)
+            self.get_links(text, "")
             return True
         except Exception as e:
             print("This is not recognised Apache open directory: " + str(e))
problematic websites (new file, 25 lines)

@@ -0,0 +1,25 @@
+Breaks:
+
+http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
+https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
+
+https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
+http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
+https://filepursuit.com/ (recursion problem - not an OD)
+https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
+
+http://www.gamers.org/pub/archives/uwp-uml/ (?)
+
+http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size)
+http://www.serenitystreetnews.com/videos/ (invalid size)
+https://www.datto.com/resource-downloads/ (invalid size)
+https://www.annmariegianni.com/wp-content/uploads/ (invalid size)
+http://dl.apkhome.org (size is incorrect)
+
+
+
+Working:
+http://www.cheeseheadhosting.us/downloads/
+http://www.michellemariephotographie.com/wp-content/gallery/
+http://jenserserver.no-ip.biz/movieserver/
+http://files.duspectacle.com/mp3/
@@ -23,10 +23,22 @@ class CrawlerTest(TestCase):
 
         self.assertEqual(c.guess_parser(text, {}), NginxParser)
 
-    def test_guess_parser3(self):
-        with open("test_invalid.html", "r") as f:
-            text = f.read()
-
-        c = Crawler("http://some.website", False)
-
-        self.assertEqual(c.guess_parser(text, {}), None)
+    # def test_guess_parser3(self):
+    #     with open("test_invalid.html", "r") as f:
+    #         text = f.read()
+    #
+    #     c = Crawler("http://some.website", False)
+    #
+    #     self.assertEqual(c.guess_parser(text, {}), None)
+
+    def test_invalid_schema(self):
+
+        c1 = Crawler("http://google.com/", False)
+        c2 = Crawler("https://google.com/", False)
+        c3 = Crawler("ftp://website.com/", False)
+        c4 = Crawler("ws://website.com/", False)
+
+        self.assertIsNotNone(c1.parser)
+        self.assertIsNotNone(c2.parser)
+        self.assertIsNone(c3.parser)
+        self.assertIsNone(c4.parser)
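What the new test asserts can also be reproduced outside the test runner. A sketch, assuming Crawler is importable from crawler.py; with test_url=False no network request is made, so the outcome depends only on the scheme check:

    from crawler import Crawler

    assert Crawler("https://google.com/", False).parser is not None  # default ApacheParser branch
    assert Crawler("ftp://website.com/", False).parser is None       # "Invalid Schema" branch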
spec/test_invalid.html (new file, 771 lines)

File diff suppressed because one or more lines are too long