From d8486104b48e7dd86302a1f7c2921427c6faf62e Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 19 Jun 2018 12:14:50 -0400 Subject: [PATCH] Fix for odd html listings --- crawl_server/remote_http.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 5a96202..ff4e6ac 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -62,7 +62,11 @@ class HttpDirectory(RemoteDirectory): "?C=N;O=D", "?C=M;O=A", "?C=S;O=A", - "?C=D;O=A" + "?C=D;O=A", + "?MA", + "?SA", + "?DA", + "?ND" ) MAX_RETRIES = 3 @@ -75,7 +79,7 @@ class HttpDirectory(RemoteDirectory): def list_dir(self, path): - path_url = self.base_url + path.strip("/") + "/" + path_url = urljoin(self.base_url, path, "") body = self._stream_body(path_url) if not body: return None @@ -96,8 +100,7 @@ class HttpDirectory(RemoteDirectory): is_dir=True ) else: - pass - urls_to_request.append(path_url + anchor.href) + urls_to_request.append(urljoin(path_url, anchor.href)) for file in self.request_files(urls_to_request): yield file @@ -181,7 +184,7 @@ class HttpDirectory(RemoteDirectory): return True # Ignore external links - full_url = os.path.join(base_url, link.href) + full_url = urljoin(base_url, link.href) if not full_url.startswith(base_url): return True