Added filter to check if a website can be scanned from its parent directory

Simon 2018-07-10 10:14:23 -04:00
parent f226b82f5a
commit d138db8f06
4 changed files with 46 additions and 9 deletions
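The new check builds on standard urllib.parse.urljoin behaviour, where joining a directory URL with "../" climbs one path level. A minimal illustration (not part of the commit; example.com is a placeholder):

from urllib.parse import urljoin

# Joining with "../" strips the last path segment (trailing slash matters):
urljoin("http://example.com/files/music/", "../")  # -> "http://example.com/files/"
urljoin("http://example.com/files/", "../")        # -> "http://example.com/"
urljoin("http://example.com/", "../")              # -> "http://example.com/" (already at root)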

app.py
@@ -322,8 +322,12 @@ def submit():
 def try_enqueue(url):
     url = os.path.join(url, "")
-    website = db.get_website_by_url(url)
+    url = od_util.get_top_directory(url)
+
+    if not od_util.is_valid_url(url):
+        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+    website = db.get_website_by_url(url)
 
     if website:
         return "Website already exists", "danger"
@@ -331,9 +335,6 @@ def try_enqueue(url):
     if website:
         return "A parent directory of this url has already been posted", "danger"
 
-    if not od_util.is_valid_url(url):
-        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "danger"
-
     if db.is_blacklisted(url):
         return "<strong>Error:</strong> " \
                "Sorry, this website has been blacklisted. If you think " \


@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
             if root_listing:
                 self.crawled_paths.append(path_id)
             else:
+                logger.info("No files in root listing for " + self.url)
                 return CrawlResult(0, "empty")
             directory.close()
         except TimeoutError:


@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
+            logger.info("No body returned @ " + path_url)
             return None, None
         anchors = self._parse_links(body)


@@ -180,7 +180,7 @@ def is_od(url):
         elif config.SUBMIT_HTTP:
             r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
             if r.status_code != 200:
-                print("No redirects allowed!")
+                # print("No redirects allowed!")
                 return False
             soup = BeautifulSoup(r.text, "lxml")
@@ -189,20 +189,54 @@ def is_od(url):
             script_tags = len(list(soup.find_all("script")))
 
             if external_links > 11:
-                print("Too many external links!")
+                # print("Too many external links!")
                 return False
 
             if link_tags > 5:
-                print("Too many link tags!")
+                # print("Too many link tags!")
                 return False
 
             if script_tags > 7:
-                print("Too many script tags!")
+                # print("Too many script tags!")
                 return False
 
             return True
 
     except Exception as e:
-        print(e)
+        # print(e)
        return False
+
+
+def has_parent_dir(url):
+
+    parsed_url = urlparse(url)
+
+    if parsed_url.path == "/":
+        return False
+
+    parent_url = urljoin(url, "../")
+    try:
+        r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+        if r.status_code != 200:
+            return False
+        soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
+                # The parent page exists, and has a link to the child directory
+                return is_od(parent_url)
+    except:
+        return False
+
+    # Parent page exists, but does not have a link to the child directory
+    return False
+
+
+def get_top_directory(url):
+    if url.startswith("ftp://"):
+        return url
+
+    while has_parent_dir(url):
+        url = urljoin(url, "../")
+
+    return url
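As a rough usage sketch (not part of the commit; the URLs and directory layout below are hypothetical), the new helpers let the submission path climb to the topmost open directory before queueing:

import od_util

# Suppose http://example.com/files/music/ is submitted, the parent listing at
# http://example.com/files/ links back to "music/", and that parent page also
# looks like an open directory. has_parent_dir() then returns True, so
# get_top_directory() climbs one level and repeats until no scannable parent
# is found.
top = od_util.get_top_directory("http://example.com/files/music/")
# e.g. -> "http://example.com/files/"

# FTP URLs are returned unchanged:
od_util.get_top_directory("ftp://example.com/pub/")  # -> "ftp://example.com/pub/"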