From b63c7190c3aed963d3089747af023e2234380825 Mon Sep 17 00:00:00 2001
From: Simon <fortier.simon@hotmail.com>
Date: Mon, 18 Jun 2018 12:14:05 -0400
Subject: [PATCH] Improved external link detection

---
 crawl_server/remote_http.py | 5 +++--
 debug_put.py                | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 5913ad2..99a6345 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,4 +1,4 @@
-from urllib.parse import unquote
+from urllib.parse import unquote, urljoin
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -177,7 +177,8 @@ class HttpDirectory(RemoteDirectory):
             return True
 
         # Ignore external links
-        if link.href.startswith("http") and not link.href.startswith(base_url):
+        full_url = urljoin(base_url, link.href)  # resolve href as a URL, not a filesystem path
+        if not full_url.startswith(base_url):  # resolved target lies outside the crawled base URL
             return True
 
     def close(self):
diff --git a/debug_put.py b/debug_put.py
index a586bed..1e18546 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -4,9 +4,9 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    # "url": "http://alphamediazone.com/data/Movies1/",
+    "url": "http://liminaire.fr/TEXTES/",
     # "url": "http://localhost:8000/",
-    "url": "http://ubuntu.mirrorservice.org/",
+    # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"