Improved external link detection

This commit is contained in:
Simon 2018-06-18 12:14:05 -04:00
parent 400abc9a3c
commit b63c7190c3
2 changed files with 5 additions and 4 deletions

View File

@@ -1,4 +1,4 @@
from urllib.parse import unquote
from urllib.parse import unquote, urljoin
import os
from html.parser import HTMLParser
from itertools import repeat
@@ -177,7 +177,8 @@ class HttpDirectory(RemoteDirectory):
return True
# Ignore external links
if link.href.startswith("http") and not link.href.startswith(base_url):
full_url = os.path.join(base_url, link.href)
if not full_url.startswith(base_url):
return True
def close(self):

View File

@@ -4,9 +4,9 @@ import json
payload = json.dumps({
"website_id": 123,
# "url": "http://alphamediazone.com/data/Movies1/",
"url": "http://liminaire.fr/TEXTES/",
# "url": "http://localhost:8000/",
"url": "http://ubuntu.mirrorservice.org/",
# "url": "http://ubuntu.mirrorservice.org/",
"priority": 2,
"callback_type": "",
"callback_args": "{}"