From c309aa25c8beb0cd25f28a508ed63d1f8afa748b Mon Sep 17 00:00:00 2001
From: Simon
Date: Sat, 16 Jun 2018 15:20:23 -0400
Subject: [PATCH] Attempt to fix unicode decode errors

---
Review notes (free-form text, not part of the commit message):
  * The UTF-8 fallback now re-parses into a fresh list. Previously, links
    appended before a UnicodeDecodeError stayed in `result` and were then
    appended a second time by the fallback branch.
  * The duplicated parse/collect sequence lives in one private helper,
    `_extract_links`, which both attempts call.
  * The post-image blob id on the `index` line below predates this revision.

 crawl_server/remote_http.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 2978b12..1ad4b50 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -102,15 +102,23 @@ class HttpDirectory(RemoteDirectory):
     def _parse_links(self, body: bytes) -> list:
-
-        result = list()
-        tree = etree.HTML(body, parser=self.parser)
-        links = []
-        try:
-            links = tree.findall(".//a/[@href]")
-        except AttributeError:
-            pass
-
-        for link in links:
-            result.append((link.text, link.get("href")))
-
-        return result
+        """Return ``(text, href)`` tuples for each ``<a href>`` in *body*.
+
+        The raw bytes are parsed first; if that raises UnicodeDecodeError the
+        body is decoded as UTF-8 (dropping undecodable bytes) and parsed again
+        from scratch, so no link is reported twice.
+        """
+        try:
+            return self._extract_links(body)
+        except UnicodeDecodeError:
+            return self._extract_links(body.decode("utf-8", errors="ignore"))
+
+    def _extract_links(self, document) -> list:
+        """Parse *document* (bytes or str) and list its ``(text, href)`` pairs."""
+        tree = etree.HTML(document, parser=self.parser)
+        try:
+            anchors = tree.findall(".//a/[@href]")
+        except AttributeError:
+            # Presumably etree.HTML returned None (nothing to parse); no links.
+            return []
+
+        return [(anchor.text, anchor.get("href")) for anchor in anchors]
 