From d8486104b48e7dd86302a1f7c2921427c6faf62e Mon Sep 17 00:00:00 2001
From: Simon <fortier.simon@hotmail.com>
Date: Tue, 19 Jun 2018 12:14:50 -0400
Subject: [PATCH] Fix for odd HTML listings

---
 crawl_server/remote_http.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 5a96202..ff4e6ac 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -62,7 +62,11 @@ class HttpDirectory(RemoteDirectory):
         "?C=N;O=D",
         "?C=M;O=A",
         "?C=S;O=A",
-        "?C=D;O=A"
+        "?C=D;O=A",
+        "?MA",
+        "?SA",
+        "?DA",
+        "?ND"
     )
     MAX_RETRIES = 3
 
@@ -75,7 +79,7 @@ class HttpDirectory(RemoteDirectory):
 
     def list_dir(self, path):
 
-        path_url = self.base_url + path.strip("/") + "/"
+        path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
             return None
@@ -96,8 +100,7 @@ class HttpDirectory(RemoteDirectory):
                     is_dir=True
                 )
             else:
-                pass
-                urls_to_request.append(path_url + anchor.href)
+                urls_to_request.append(urljoin(path_url, anchor.href))
 
         for file in self.request_files(urls_to_request):
             yield file
@@ -181,7 +184,7 @@ class HttpDirectory(RemoteDirectory):
             return True
 
         # Ignore external links
-        full_url = os.path.join(base_url, link.href)
+        full_url = urljoin(base_url, link.href)
         if not full_url.startswith(base_url):
             return True