Fix unknown-encoding decode errors and remove HTTPS warnings

This commit is contained in:
Simon 2018-06-21 19:23:01 -04:00
parent 80aa8933e6
commit 098ad2be72

View File

@ -1,5 +1,4 @@
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import warnings
import os import os
from html.parser import HTMLParser from html.parser import HTMLParser
from itertools import repeat from itertools import repeat
@ -11,6 +10,9 @@ import config
from dateutil.parser import parse as parse_date from dateutil.parser import parse as parse_date
import hashlib import hashlib
import urllib3
urllib3.disable_warnings()
class Anchor: class Anchor:
def __init__(self): def __init__(self):
@ -134,47 +136,47 @@ class HttpDirectory(RemoteDirectory):
def _request_file(self, url):
    """HEAD *url* and build a File record from the response headers.

    Retries up to HttpDirectory.MAX_RETRIES times on RequestException,
    closing the session before each retry so a fresh connection is used.
    Returns None when every attempt fails.
    """
    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            r = self.session.head(url, allow_redirects=False, timeout=40)

            # Path of the entry relative to the directory's base url.
            stripped_url = url[len(self.base_url) - 1:]
            path, name = os.path.split(stripped_url)

            # Fall back to the epoch when the server omits Last-Modified.
            date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"

            # A malformed Content-Length must not abort the crawl: treat it
            # like a missing header (size unknown -> -1).
            try:
                size = int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1
            except ValueError:
                size = -1

            return File(
                path=unquote(path).strip("/"),
                name=unquote(name),
                size=size,
                mtime=int(parse_date(date).timestamp()),
                is_dir=False
            )
        except RequestException:
            # Connection-level failure: drop the session and retry.
            self.session.close()
            retries -= 1

    return None
def _stream_body(self, url: str):
    """Stream the decoded text body of *url* as str chunks (~4 KiB raw).

    Uses an incremental decoder so multi-byte characters split across
    chunk boundaries are reassembled instead of being silently dropped
    (a plain per-chunk .decode(errors="ignore") loses them). A server
    encoding the codecs registry does not know falls back to UTF-8, as
    the original per-chunk LookupError handler did. Retries up to
    HttpDirectory.MAX_RETRIES times on RequestException; yields nothing
    when every attempt fails.
    """
    import codecs  # stdlib; local import keeps the file's import block untouched

    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            r = self.session.get(url, stream=True, timeout=40)
            try:
                # Honor the server-declared encoding when the codec exists.
                decoder = codecs.getincrementaldecoder(
                    r.encoding if r.encoding else "utf-8")(errors="ignore")
            except LookupError:
                # Unsupported encoding
                decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")

            for chunk in r.iter_content(chunk_size=4096):
                yield decoder.decode(chunk)

            # Flush any bytes still buffered by the incremental decoder.
            tail = decoder.decode(b"", final=True)
            if tail:
                yield tail

            r.close()
            del r
            break
        except RequestException:
            # Connection-level failure: drop the session and retry.
            self.session.close()
            retries -= 1

    return None
@staticmethod @staticmethod
def _parse_links(body): def _parse_links(body):