Added support for more than just utf-8 and removed some debug info

This commit is contained in:
Simon 2018-06-18 13:44:19 -04:00
parent 7c47b0f00c
commit 8a73142ff8
4 changed files with 38 additions and 39 deletions

View File

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
def crawl_directory(self, out_file: str) -> CrawlResult: def crawl_directory(self, out_file: str) -> CrawlResult:
import gc
gc.set_debug(gc.DEBUG_LEAK)
try: try:
directory = RemoteDirectoryFactory.get_directory(self.url) directory = RemoteDirectoryFactory.get_directory(self.url)
root_listing = directory.list_dir("") root_listing = directory.list_dir("")

View File

@@ -1,4 +1,5 @@
from urllib.parse import unquote, urljoin from urllib.parse import unquote, urljoin
import warnings
import os import os
from html.parser import HTMLParser from html.parser import HTMLParser
from itertools import repeat from itertools import repeat
@@ -118,6 +119,8 @@ class HttpDirectory(RemoteDirectory):
def _request_file(self, url): def _request_file(self, url):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
@@ -141,13 +144,14 @@ class HttpDirectory(RemoteDirectory):
return None return None
def _stream_body(self, url: str): def _stream_body(self, url: str):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
retries = HttpDirectory.MAX_RETRIES retries = HttpDirectory.MAX_RETRIES
while retries > 0: while retries > 0:
try: try:
r = self.session.get(url, stream=True, timeout=40) r = self.session.get(url, stream=True, timeout=40)
for chunk in r.iter_content(chunk_size=4096): for chunk in r.iter_content(chunk_size=4096):
yield chunk yield chunk.decode(r.encoding, errors="ignore")
r.close() r.close()
del r del r
break break
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
parser = HTMLAnchorParser() parser = HTMLAnchorParser()
for chunk in body: for chunk in body:
parser.feed(chunk.decode("utf-8", errors="ignore")) parser.feed(chunk)
for anchor in parser.anchors: for anchor in parser.anchors:
yield anchor yield anchor

View File

@@ -53,9 +53,6 @@ class TaskManager:
@staticmethod @staticmethod
def run_task(task, db_path, current_tasks): def run_task(task, db_path, current_tasks):
# import gc
# gc.set_debug(gc.DEBUG_LEAK)
result = TaskResult() result = TaskResult()
result.start_time = datetime.utcnow() result.start_time = datetime.utcnow()
result.website_id = task.website_id result.website_id = task.website_id

View File

@@ -4,7 +4,7 @@ import json
payload = json.dumps({ payload = json.dumps({
"website_id": 123, "website_id": 123,
"url": "http://liminaire.fr/TEXTES/", "url": "https://computerarchive.org/files/computer/",
# "url": "http://localhost:8000/", # "url": "http://localhost:8000/",
# "url": "http://ubuntu.mirrorservice.org/", # "url": "http://ubuntu.mirrorservice.org/",
"priority": 2, "priority": 2,