diff --git a/app.py b/app.py
index 601f5ab..1fa2ee1 100644
--- a/app.py
+++ b/app.py
@@ -322,8 +322,12 @@ def submit():
def try_enqueue(url):
url = os.path.join(url, "")
- website = db.get_website_by_url(url)
+ url = od_util.get_top_directory(url)
+ if not od_util.is_valid_url(url):
+ return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning"
+
+ website = db.get_website_by_url(url)
if website:
return "Website already exists", "danger"
@@ -331,9 +335,6 @@ def try_enqueue(url):
if website:
return "A parent directory of this url has already been posted", "danger"
- if not od_util.is_valid_url(url):
- return "Error: Invalid url. Make sure to include the appropriate scheme.", "danger"
-
if db.is_blacklisted(url):
return "Error: " \
"Sorry, this website has been blacklisted. If you think " \
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 79edd2c..6a637a1 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -92,6 +92,7 @@ class RemoteDirectoryCrawler:
if root_listing:
self.crawled_paths.append(path_id)
else:
+ logger.info("No files in root listing for " + self.url)
return CrawlResult(0, "empty")
directory.close()
except TimeoutError:
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index abbb4ee..5539ec1 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -105,6 +105,7 @@ class HttpDirectory(RemoteDirectory):
path_url = urljoin(self.base_url, path, "")
body = self._stream_body(path_url)
if not body:
+ logger.info("No body returned @ " + path_url)
return None, None
anchors = self._parse_links(body)
diff --git a/od_util.py b/od_util.py
index 69bffae..00057e0 100644
--- a/od_util.py
+++ b/od_util.py
@@ -180,7 +180,7 @@ def is_od(url):
elif config.SUBMIT_HTTP:
r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
if r.status_code != 200:
- print("No redirects allowed!")
+ # print("No redirects allowed!")
return False
soup = BeautifulSoup(r.text, "lxml")
@@ -189,20 +189,54 @@ def is_od(url):
script_tags = len(list(soup.find_all("script")))
if external_links > 11:
- print("Too many external links!")
+ # print("Too many external links!")
return False
if link_tags > 5:
- print("Too many link tags!")
+ # print("Too many link tags!")
return False
if script_tags > 7:
- print("Too many script tags!")
+ # print("Too many script tags!")
return False
return True
except Exception as e:
- print(e)
+ # print(e)
return False
+
+def has_parent_dir(url):
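+    """Return True if the parent directory of url exists, links back to url
+    and itself looks like an open directory."""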
+
+ parsed_url = urlparse(url)
+
+ if parsed_url.path == "/":
+ return False
+
+ parent_url = urljoin(url, "../")
+ try:
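+        # Same request settings as is_od(): 30s timeout, no redirects, no certificate verification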
+ r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
+ if r.status_code != 200:
+ return False
+ soup = BeautifulSoup(r.text, "lxml")
+
+        for anchor in soup.find_all("a"):
+            href = anchor.get("href")
+            if href and href.endswith("/") and urljoin(parent_url, href) == url:
+ # The parent page exists, and has a link to the child directory
+ return is_od(parent_url)
+
+    except Exception:
+ return False
+
+ # Parent page exists, but does not have a link to the child directory
+ return False
+
+
+def get_top_directory(url):
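+    """Walk up parent directories and return the topmost url that still
+    looks like an open directory. FTP urls are returned unchanged."""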
+ if url.startswith("ftp://"):
+ return url
+
+ while has_parent_dir(url):
+ url = urljoin(url, "../")
+ return url