diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index 5913ad2..99a6345 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -1,4 +1,4 @@ -from urllib.parse import unquote +from urllib.parse import unquote, urljoin import os from html.parser import HTMLParser from itertools import repeat @@ -177,7 +177,8 @@ class HttpDirectory(RemoteDirectory): return True # Ignore external links - if link.href.startswith("http") and not link.href.startswith(base_url): + full_url = urljoin(base_url, link.href) + if not full_url.startswith(base_url): return True def close(self): diff --git a/debug_put.py b/debug_put.py index a586bed..1e18546 100644 --- a/debug_put.py +++ b/debug_put.py @@ -4,9 +4,9 @@ import json payload = json.dumps({ "website_id": 123, - # "url": "http://alphamediazone.com/data/Movies1/", + "url": "http://liminaire.fr/TEXTES/", # "url": "http://localhost:8000/", - "url": "http://ubuntu.mirrorservice.org/", + # "url": "http://ubuntu.mirrorservice.org/", "priority": 2, "callback_type": "", "callback_args": "{}"