Should fix the memory usage problem when crawling

Simon
2018-06-14 23:36:54 -04:00
parent 9aed18c2d2
commit adb94cf326
6 changed files with 46 additions and 12 deletions


@@ -36,7 +36,7 @@ class HttpDirectory(RemoteDirectory):
     def __init__(self, url):
         super().__init__(url)
-        self.parser = etree.HTMLParser(collect_ids=False)
+        self.parser = etree.HTMLParser(collect_ids=False, encoding='utf-8')

     def list_dir(self, path) -> list:
         results = []
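
As a side note, here is a minimal sketch of what the reconfigured parser does (the sample markup is illustrative, not from the crawler): collect_ids=False stops lxml from building its hash table of XML IDs, and a fixed encoding skips charset detection on every page.

    from lxml import etree

    # collect_ids=False: skip building the ID->element hash table,
    # which saves memory across many large directory listings.
    # encoding='utf-8': decode input bytes directly, no detection pass.
    parser = etree.HTMLParser(collect_ids=False, encoding='utf-8')

    page = b'<html><body><a href="docs/">docs/</a><a href="a.iso">a.iso</a></body></html>'
    root = etree.fromstring(page, parser=parser)

    for anchor in root.iter('a'):
        print(anchor.get('href'))  # docs/  a.iso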
@@ -103,7 +103,7 @@ class HttpDirectory(RemoteDirectory):
         while retries > 0:
             try:
                 r = requests.get(url, headers=HttpDirectory.HEADERS)
-                return r.content
+                return r.text
             except RequestException:
                 retries -= 1
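
For the second hunk, a standalone sketch of the retry loop under assumed names (fetch_body, HEADERS, and the retry count are placeholders): requests' Response.content is the raw body bytes, while Response.text is the body decoded to a str using the detected encoding.

    import requests
    from requests.exceptions import RequestException

    HEADERS = {'User-Agent': 'crawler'}  # placeholder for HttpDirectory.HEADERS

    def fetch_body(url, retries=3):
        # Retry a few times on connection errors, then give up.
        while retries > 0:
            try:
                r = requests.get(url, headers=HEADERS)
                return r.text  # decoded str; r.content would be raw bytes
            except RequestException:
                retries -= 1
        return None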