Fix unknown-encoding decode errors and remove HTTPS warnings

This commit is contained in:
Simon 2018-06-21 19:23:01 -04:00
parent 80aa8933e6
commit 098ad2be72

View File

@ -1,5 +1,4 @@
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import warnings
import os import os
from html.parser import HTMLParser from html.parser import HTMLParser
from itertools import repeat from itertools import repeat
@ -11,6 +10,9 @@ import config
from dateutil.parser import parse as parse_date from dateutil.parser import parse as parse_date
import hashlib import hashlib
import urllib3
urllib3.disable_warnings()
class Anchor: class Anchor:
def __init__(self): def __init__(self):
@ -134,47 +136,47 @@ class HttpDirectory(RemoteDirectory):
def _request_file(self, url):
    """HEAD *url* and build a File record from the response headers.

    Retries up to HttpDirectory.MAX_RETRIES times on RequestException,
    closing the session before each retry so a fresh connection is used.
    Returns None when every attempt fails.
    """
    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            r = self.session.head(url, allow_redirects=False, timeout=40)

            # Path of the entry relative to the directory's base url.
            stripped_url = url[len(self.base_url) - 1:]
            path, name = os.path.split(stripped_url)

            # Fall back to the epoch when the server omits Last-Modified.
            date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"

            # A malformed Content-Length must not abort the crawl: treat it
            # like a missing header (size unknown -> -1).
            try:
                size = int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1
            except ValueError:
                size = -1

            return File(
                path=unquote(path).strip("/"),
                name=unquote(name),
                size=size,
                mtime=int(parse_date(date).timestamp()),
                is_dir=False
            )
        except RequestException:
            # Connection-level failure: drop the session and retry.
            self.session.close()
            retries -= 1

    return None
def _stream_body(self, url: str):
    """Stream the decoded text body of *url* as str chunks (~4 KiB raw).

    Uses an incremental decoder so multi-byte characters split across
    chunk boundaries are reassembled instead of being silently dropped
    (a plain per-chunk .decode(errors="ignore") loses them). A server
    encoding the codecs registry does not know falls back to UTF-8, as
    the original per-chunk LookupError handler did. Retries up to
    HttpDirectory.MAX_RETRIES times on RequestException; yields nothing
    when every attempt fails.
    """
    import codecs  # stdlib; local import keeps the file's import block untouched

    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            r = self.session.get(url, stream=True, timeout=40)
            try:
                # Honor the server-declared encoding when the codec exists.
                decoder = codecs.getincrementaldecoder(
                    r.encoding if r.encoding else "utf-8")(errors="ignore")
            except LookupError:
                # Unsupported encoding
                decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

            for chunk in r.iter_content(chunk_size=4096):
                yield decoder.decode(chunk)

            # Flush any bytes still buffered by the incremental decoder.
            tail = decoder.decode(b"", final=True)
            if tail:
                yield tail

            r.close()
            del r
            break
        except RequestException:
            # Connection-level failure: drop the session and retry.
            self.session.close()
            retries -= 1

    return None
@staticmethod @staticmethod
def _parse_links(body): def _parse_links(body):