mirror of
https://github.com/simon987/od-database.git
synced 2025-04-18 18:06:44 +00:00
Should fix unknown encoding errors + removed https warnings
This commit is contained in:
parent
80aa8933e6
commit
098ad2be72
@ -1,5 +1,4 @@
|
||||
from urllib.parse import unquote, urljoin
|
||||
import warnings
|
||||
import os
|
||||
from html.parser import HTMLParser
|
||||
from itertools import repeat
|
||||
@ -11,6 +10,9 @@ import config
|
||||
from dateutil.parser import parse as parse_date
|
||||
import hashlib
|
||||
|
||||
import urllib3
|
||||
urllib3.disable_warnings()
|
||||
|
||||
|
||||
class Anchor:
|
||||
def __init__(self):
|
||||
@ -134,47 +136,47 @@ class HttpDirectory(RemoteDirectory):
|
||||
|
||||
def _request_file(self, url):
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
retries = HttpDirectory.MAX_RETRIES
|
||||
while retries > 0:
|
||||
try:
|
||||
r = self.session.head(url, allow_redirects=False, timeout=40)
|
||||
retries = HttpDirectory.MAX_RETRIES
|
||||
while retries > 0:
|
||||
try:
|
||||
r = self.session.head(url, allow_redirects=False, timeout=40)
|
||||
|
||||
stripped_url = url[len(self.base_url) - 1:]
|
||||
stripped_url = url[len(self.base_url) - 1:]
|
||||
|
||||
path, name = os.path.split(stripped_url)
|
||||
date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
|
||||
return File(
|
||||
path=unquote(path).strip("/"),
|
||||
name=unquote(name),
|
||||
size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
|
||||
mtime=int(parse_date(date).timestamp()),
|
||||
is_dir=False
|
||||
)
|
||||
except RequestException:
|
||||
self.session.close()
|
||||
retries -= 1
|
||||
path, name = os.path.split(stripped_url)
|
||||
date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
|
||||
return File(
|
||||
path=unquote(path).strip("/"),
|
||||
name=unquote(name),
|
||||
size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
|
||||
mtime=int(parse_date(date).timestamp()),
|
||||
is_dir=False
|
||||
)
|
||||
except RequestException:
|
||||
self.session.close()
|
||||
retries -= 1
|
||||
|
||||
return None
|
||||
return None
|
||||
|
||||
def _stream_body(self, url: str):
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
retries = HttpDirectory.MAX_RETRIES
|
||||
while retries > 0:
|
||||
try:
|
||||
r = self.session.get(url, stream=True, timeout=40)
|
||||
for chunk in r.iter_content(chunk_size=4096):
|
||||
retries = HttpDirectory.MAX_RETRIES
|
||||
while retries > 0:
|
||||
try:
|
||||
r = self.session.get(url, stream=True, timeout=40)
|
||||
for chunk in r.iter_content(chunk_size=4096):
|
||||
try:
|
||||
yield chunk.decode(r.encoding if r.encoding else "utf-8", errors="ignore")
|
||||
r.close()
|
||||
del r
|
||||
break
|
||||
except RequestException:
|
||||
self.session.close()
|
||||
retries -= 1
|
||||
except LookupError:
|
||||
# Unsupported encoding
|
||||
yield chunk.decode("utf-8", errors="ignore")
|
||||
r.close()
|
||||
del r
|
||||
break
|
||||
except RequestException:
|
||||
self.session.close()
|
||||
retries -= 1
|
||||
|
||||
return None
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _parse_links(body):
|
||||
|
Loading…
x
Reference in New Issue
Block a user