Attempt to fix Unicode decode errors when parsing links in HttpDirectory

This commit is contained in:
Simon 2018-06-16 15:20:23 -04:00
parent 9d0a0a8b42
commit c309aa25c8

View File

@ -102,6 +102,7 @@ class HttpDirectory(RemoteDirectory):
def _parse_links(self, body: bytes) -> list: def _parse_links(self, body: bytes) -> list:
result = list() result = list()
try:
tree = etree.HTML(body, parser=self.parser) tree = etree.HTML(body, parser=self.parser)
links = [] links = []
try: try:
@ -111,6 +112,16 @@ class HttpDirectory(RemoteDirectory):
for link in links: for link in links:
result.append((link.text, link.get("href"))) result.append((link.text, link.get("href")))
except UnicodeDecodeError:
tree = etree.HTML(body.decode("utf-8", errors="ignore"), parser=self.parser)
links = []
try:
links = tree.findall(".//a/[@href]")
except AttributeError:
pass
for link in links:
result.append((link.text, link.get("href")))
return result return result