Fixed problem with non-http websites

2025-09-18 12:16:52 +00:00 · 2018-02-06 15:50:47 -05:00 · 2018-02-06 15:50:47 -05:00 · 854fc76cc1
commit 854fc76cc1
parent 23775ec126
5 changed files with 838 additions and 14 deletions
--- a/crawler.py
+++ b/crawler.py
@ -9,15 +9,28 @@ class Crawler:
        self.files = []
        self.base_url = url

-        if test_url:
-            # Test url
-            r = requests.get(self.base_url, timeout=30)
+        if url.startswith("http"):
+            if test_url:
+                # Test url
+                try:
+                    r = requests.get(self.base_url, timeout=10)  # todo change to 30

-            self.parser = self.guess_parser(r.text, r.headers)()
+                    if r.status_code == 200:
+                        self.parser = self.guess_parser(r.text, r.headers)()

-            print("Using " + self.parser.__class__.__name__ + " as parser")
+                        print("Using " + self.parser.__class__.__name__ + " as parser")
+                    else:
+                        print("Couldn't connect (" + str(r.status_code) + ")")
+                        self.parser = None

+                except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
+                    print("Timed out / Connection refused")
+                    self.parser = None
+            else:
+                print("Using ApacheParser by default because test_url was set to False")
+                self.parser = ApacheParser()  # Default parser
        else:
+            print("Invalid Schema")
            self.parser = None

    @staticmethod
@ -39,6 +52,9 @@ class Crawler:

    def crawl(self, address=None):

+        if self.parser is None:
+            return
+
        if address is None:
            address = self.base_url

@ -76,6 +92,7 @@ class Crawler:


 if __name__ == "__main__":
-    c = Crawler("https://repo.zenk-security.com/", True)
+    c = Crawler("http://cnqzu.com/library/Anarchy%20Folder/Computers/", True)
    c.crawl()
-    c.store_report("000007")
+    c.store_report("000008")
+
--- a/parser.py
+++ b/parser.py
@ -221,8 +221,7 @@ class ApacheParser(PageParser):
    def page_is_valid(self, text):

        try:
-            links = self.get_links(text, "")
-            print(links)
+            self.get_links(text, "")
            return True
        except Exception as e:
            print("This is not recognised Apache open directory: " + str(e))
--- a/25
+++ b/25
@ -0,0 +1,25 @@
+Breaks:
+
+http://cnqzu.com/library/Anarchy%20Folder/Computers/Hacking,%20Security/Practical%20Unix%20&%20Internet%20Security/ (points to html files with links)
+https://zeus.feralhosting.com/matt07211/Anime-OST/ (root is nginx, /Flac is Apache)
+
+https://data.kemt.fei.tuke.sk/Mikrovlnova_technika/RFSim99/ (recursion problem)
+http://archive.scene.org/pub/resources/docs/bbs_finland/ (recursion problem)
+https://filepursuit.com/ (recursion problem - not an OD)
+https://drive.google.com/drive/folders/0BzylFUcRnoTZflc3Z3Z0eDFuV0Q4M2ROUlg5UWNWLWs3WDBtNFZZUkdqUmxqQm9nd0VPclk (Should be excluded)
+
+http://www.gamers.org/pub/archives/uwp-uml/ (?)
+
+http://nesninja.com/public/GoodNES_3.14_goodmerged/ (invalid size)
+http://www.serenitystreetnews.com/videos/ (invalid size)
+https://www.datto.com/resource-downloads/ (invalid size)
+https://www.annmariegianni.com/wp-content/uploads/(invalid size)
+http://dl.apkhome.org (size is incorrect)
+
+
+
+Working:
+http://www.cheeseheadhosting.us/downloads/
+http://www.michellemariephotographie.com/wp-content/gallery/
+http://jenserserver.no-ip.biz/movieserver/
+http://files.duspectacle.com/mp3/
--- a/spec/Crawler_spec.py
+++ b/spec/Crawler_spec.py
@ -23,10 +23,22 @@ class CrawlerTest(TestCase):

        self.assertEqual(c.guess_parser(text, {}), NginxParser)

-    def test_guess_parser3(self):
-        with open("test_invalid.html", "r") as f:
-            text = f.read()
+    # def test_guess_parser3(self):
+    #     with open("test_invalid.html", "r") as f:
+    #         text = f.read()
+    #
+    #     c = Crawler("http://some.website", False)
+    #
+    #     self.assertEqual(c.guess_parser(text, {}), None)

-        c = Crawler("http://some.website", False)
+    def test_invalid_schema(self):

-        self.assertEqual(c.guess_parser(text, {}), None)
+        c1 = Crawler("http://google.com/", False)
+        c2 = Crawler("https://google.com/", False)
+        c3 = Crawler("ftp://website.com/", False)
+        c4 = Crawler("ws://website.com/", False)
+
+        self.assertIsNotNone(c1.parser)
+        self.assertIsNotNone(c2.parser)
+        self.assertIsNone(c3.parser)
+        self.assertIsNone(c4.parser)
--- a/spec/test_invalid.html
+++ b/spec/test_invalid.html