From d138db8f06494c6b76536c6492eb5989ea964661 Mon Sep 17 00:00:00 2001
From: Simon
Date: Tue, 10 Jul 2018 10:14:23 -0400
Subject: [PATCH] Added filter to check if a website can be scanned from its
 parent directory

---
 app.py                      |  9 ++++----
 crawl_server/crawler.py     |  1 +
 crawl_server/remote_http.py |  1 +
 od_util.py                  | 44 ++++++++++++++++++++++++++++++++-----
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/app.py b/app.py
index 601f5ab..1fa2ee1 100644
--- a/app.py
+++ b/app.py
@@ -322,8 +322,12 @@ def submit():
 
 def try_enqueue(url):
     url = os.path.join(url, "")
-    website = db.get_website_by_url(url)
+    url = od_util.get_top_directory(url)
 
+    if not od_util.is_valid_url(url):
+        return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+    website = db.get_website_by_url(url)
     if website:
         return "Website already exists", "danger"
 
@@ -331,9 +335,6 @@ def try_enqueue(url):
     if website:
         return "A parent directory of this url has already been posted", "danger"
 
-    if not od_util.is_valid_url(url):
-        return "Error: Invalid url. Make sure to include the appropriate scheme.", "danger"
-
     if db.is_blacklisted(url):
         return "Error: " \
                "Sorry, this website has been blacklisted. If you think " \
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 79edd2c..6a637a1 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
                 if root_listing:
                     self.crawled_paths.append(path_id)
                 else:
+                    logger.info("No files in root listing for " + self.url)
                     return CrawlResult(0, "empty")
                 directory.close()
             except TimeoutError:
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index abbb4ee..5539ec1 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
+            logger.info("No body returned @ " + path_url)
             return None, None
 
         anchors = self._parse_links(body)
diff --git a/od_util.py b/od_util.py
index 69bffae..00057e0 100644
--- a/od_util.py
+++ b/od_util.py
@@ -180,7 +180,7 @@ def is_od(url):
         elif config.SUBMIT_HTTP:
             r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
             if r.status_code != 200:
-                print("No redirects allowed!")
+                # print("No redirects allowed!")
                 return False
             soup = BeautifulSoup(r.text, "lxml")
 
@@ -189,20 +189,54 @@ def is_od(url):
             script_tags = len(list(soup.find_all("script")))
 
             if external_links > 11:
-                print("Too many external links!")
+                # print("Too many external links!")
                 return False
 
             if link_tags > 5:
-                print("Too many link tags!")
+                # print("Too many link tags!")
                 return False
 
             if script_tags > 7:
-                print("Too many script tags!")
+                # print("Too many script tags!")
                 return False
 
             return True
 
     except Exception as e:
-        print(e)
+        # print(e)
         return False
 
+
+def has_parent_dir(url):
+
+    parsed_url = urlparse(url)
+
+    if parsed_url.path == "/":
+        return False
+
+    parent_url = urljoin(url, "../")
+    try:
+        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+        if r.status_code != 200:
+            return False
+        soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
+                # The parent page exists, and has a link to the child directory
+                return is_od(parent_url)
+
+    except:
+        return False
+
+    # Parent page exists, but does not have a link to the child directory
+    return False
+
+
+def get_top_directory(url):
+    if url.startswith("ftp://"):
+        return url
+
+    while has_parent_dir(url):
+        url = urljoin(url, "../")
+    return url
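
A rough usage sketch of the new helpers (not part of the patch): the URL below is hypothetical, and it assumes od_util.py from this commit is importable with its requests/BeautifulSoup dependencies installed. get_top_directory() is what try_enqueue() now calls to normalize a submission to its topmost crawlable directory before the rest of the checks run:

    import od_util

    # Hypothetical submission; get_top_directory() repeatedly applies
    # urljoin(url, "../") while has_parent_dir() confirms that the parent
    # page links back to this directory and is itself an open directory.
    submitted = "http://example.com/pub/linux/"
    top = od_util.get_top_directory(submitted)
    print(top)  # e.g. "http://example.com/pub/" if the parent also qualifies

    # FTP urls are returned unchanged: has_parent_dir() only probes over HTTP.
    print(od_util.get_top_directory("ftp://example.com/pub/"))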