From b63c7190c3aed963d3089747af023e2234380825 Mon Sep 17 00:00:00 2001
From: Simon
Date: Mon, 18 Jun 2018 12:14:05 -0400
Subject: [PATCH] Improved external link detection

---
Review note: the external-link check now resolves link.href with
urllib.parse.urljoin() (imported in the first hunk) instead of
os.path.join(). os.path.join() only concatenates path components, so an
absolute href such as "http://other.host/" was appended to base_url and
the startswith(base_url) test then passed for every external link.
urljoin() resolves absolute, root-relative and "../" hrefs against
base_url, so anything that leaves base_url is ignored.
Leading whitespace in the hunks below was reconstructed from a
whitespace-collapsed copy of this patch; verify it against the target
tree before applying.

 crawl_server/remote_http.py | 5 +++--
 debug_put.py                | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 5913ad2..99a6345 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,4 +1,4 @@
-from urllib.parse import unquote
+from urllib.parse import unquote, urljoin
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -177,7 +177,8 @@ class HttpDirectory(RemoteDirectory):
             return True
 
         # Ignore external links
-        if link.href.startswith("http") and not link.href.startswith(base_url):
+        full_url = urljoin(base_url, link.href)
+        if not full_url.startswith(base_url):
             return True
 
     def close(self):
diff --git a/debug_put.py b/debug_put.py
index a586bed..1e18546 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -4,9 +4,9 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    # "url": "http://alphamediazone.com/data/Movies1/",
+    "url": "http://liminaire.fr/TEXTES/",
     # "url": "http://localhost:8000/",
-    "url": "http://ubuntu.mirrorservice.org/",
+    # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,
     "callback_type": "",
     "callback_args": "{}"