From cadaf14c1b9a8e5ef2b5861b87ff0fc65b7dd6bf Mon Sep 17 00:00:00 2001 From: Simon Date: Thu, 23 Aug 2018 12:12:23 -0400 Subject: [PATCH] Small bugfix --- crawl_server/remote_http.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 7d73c8e..3a99947 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -119,7 +119,6 @@ class HttpDirectory(RemoteDirectory): return curl_head - def list_dir(self, path): current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1] @@ -235,14 +234,14 @@ class HttpDirectory(RemoteDirectory): @staticmethod def _should_ignore(base_url, current_path, link: Anchor): - if urljoin(base_url, link.href) == urljoin(urljoin(base_url, current_path), "../"): + full_url = urljoin(base_url, link.href) + if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url: return True if link.href.endswith(HttpDirectory.BLACK_LIST): return True # Ignore external links - full_url = urljoin(base_url, link.href) if not full_url.startswith(base_url): return True