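Should fix unknown-encoding errors (fall back to UTF-8 when the response's declared encoding is unsupported) + removed HTTPS warnings (a global urllib3.disable_warnings() replaces the per-request warnings filters)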

This commit is contained in:
Simon 2018-06-21 19:23:01 -04:00
parent 80aa8933e6
commit 098ad2be72

View File

@ -1,5 +1,4 @@
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import warnings
import os import os
from html.parser import HTMLParser from html.parser import HTMLParser
from itertools import repeat from itertools import repeat
@ -11,6 +10,9 @@ import config
from dateutil.parser import parse as parse_date from dateutil.parser import parse as parse_date
import hashlib import hashlib
import urllib3
urllib3.disable_warnings()
class Anchor: class Anchor:
def __init__(self): def __init__(self):
@ -134,8 +136,6 @@ class HttpDirectory(RemoteDirectory):
def _request_file(self, url): def _request_file(self, url):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
@ -159,14 +159,16 @@ class HttpDirectory(RemoteDirectory):
return None return None
def _stream_body(self, url: str): def _stream_body(self, url: str):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
r = self.session.get(url, stream=True, timeout=40) r = self.session.get(url, stream=True, timeout=40)
for chunk in r.iter_content(chunk_size=4096): for chunk in r.iter_content(chunk_size=4096):
try:
yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore") yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore")
except LookupError:
# Unsupported encoding
yield chunk.decode("utf-8", errors="ignore")
r.close() r.close()
del r del r
break break