From d138db8f06494c6b76536c6492eb5989ea964661 Mon Sep 17 00:00:00 2001
From: Simon
Date: Tue, 10 Jul 2018 10:14:23 -0400
Subject: [PATCH] Added filter to check if a website can be scanned from its
 parent directory

---
 app.py                      |  9 ++++----
 crawl_server/crawler.py     |  1 +
 crawl_server/remote_http.py |  1 +
 od_util.py                  | 44 ++++++++++++++++++++++++++++++++-----
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/app.py b/app.py
index 601f5ab..1fa2ee1 100644
--- a/app.py
+++ b/app.py
@@ -322,8 +322,12 @@ def submit():
 
 def try_enqueue(url):
     url = os.path.join(url, "")
-    website = db.get_website_by_url(url)
+    url = od_util.get_top_directory(url)
 
+    if not od_util.is_valid_url(url):
+        return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+    website = db.get_website_by_url(url)
     if website:
         return "Website already exists", "danger"
 
@@ -331,9 +335,6 @@ def try_enqueue(url):
     if website:
         return "A parent directory of this url has already been posted", "danger"
 
-    if not od_util.is_valid_url(url):
-        return "Error: Invalid url. Make sure to include the appropriate scheme.", "danger"
-
     if db.is_blacklisted(url):
         return "Error: " \
                "Sorry, this website has been blacklisted. If you think " \
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 79edd2c..6a637a1 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
                 if root_listing:
                     self.crawled_paths.append(path_id)
                 else:
+                    logger.info("No files in root listing for " + self.url)
                     return CrawlResult(0, "empty")
                 directory.close()
             except TimeoutError:
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index abbb4ee..5539ec1 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
+            logger.info("No body returned @ " + path_url)
             return None, None
 
         anchors = self._parse_links(body)
diff --git a/od_util.py b/od_util.py
index 69bffae..00057e0 100644
--- a/od_util.py
+++ b/od_util.py
@@ -180,7 +180,7 @@ def is_od(url):
         elif config.SUBMIT_HTTP:
             r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
             if r.status_code != 200:
-                print("No redirects allowed!")
+                # print("No redirects allowed!")
                 return False
             soup = BeautifulSoup(r.text, "lxml")
 
@@ -189,20 +189,54 @@ def is_od(url):
             script_tags = len(list(soup.find_all("script")))
 
             if external_links > 11:
-                print("Too many external links!")
+                # print("Too many external links!")
                 return False
 
             if link_tags > 5:
-                print("Too many link tags!")
+                # print("Too many link tags!")
                 return False
 
             if script_tags > 7:
-                print("Too many script tags!")
+                # print("Too many script tags!")
                 return False
 
             return True
 
     except Exception as e:
-        print(e)
+        # print(e)
         return False
 
+
+def has_parent_dir(url):
+
+    parsed_url = urlparse(url)
+
+    if parsed_url.path == "/":
+        return False
+
+    parent_url = urljoin(url, "../")
+    try:
+        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+        if r.status_code != 200:
+            return False
+        soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
+                # The parent page exists, and has a link to the child directory
+                return is_od(parent_url)
+
+    except:
+        return False
+
+    # Parent page exists, but does not have a link to the child directory
+    return False
+
+
+def get_top_directory(url):
+    if url.startswith("ftp://"):
+        return url
+
+    while has_parent_dir(url):
+        url = urljoin(url, "../")
+    return url
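
A rough usage sketch of the new helpers (not part of the patch): the URL below is hypothetical, and it assumes od_util.py from this commit is importable with its requests/BeautifulSoup dependencies installed. get_top_directory() is what try_enqueue() now calls to normalize a submission to its topmost crawlable directory before the rest of the checks run:

    import od_util

    # Hypothetical submission; get_top_directory() repeatedly applies
    # urljoin(url, "../") while has_parent_dir() confirms that the parent
    # page links back to this directory and is itself an open directory.
    submitted = "http://example.com/pub/linux/"
    top = od_util.get_top_directory(submitted)
    print(top)  # e.g. "http://example.com/pub/" if the parent also qualifies

    # FTP urls are returned unchanged: has_parent_dir() only probes over HTTP.
    print(od_util.get_top_directory("ftp://example.com/pub/"))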