Support for more than just UTF-8 and removed some debug info

Simon 2018-06-18 13:44:19 -04:00
parent 7c47b0f00c
commit 8a73142ff8
4 changed files with 38 additions and 39 deletions
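
In short: the crawler previously decoded every streamed response as UTF-8 when parsing directory listings; after this change the chunks are decoded with the encoding reported by the server, and library warnings are silenced around the HTTP calls. A minimal standalone sketch of the idea, using requests directly rather than the project's HttpDirectory class (the fallback to UTF-8 when no encoding is reported is an assumption added here, not code from this commit):

    import requests

    def stream_text(url, chunk_size=4096):
        # Yield decoded text chunks using the encoding the server reports
        # (requests derives r.encoding from the Content-Type header) instead
        # of assuming UTF-8. Undecodable bytes are dropped, as in the diff.
        r = requests.get(url, stream=True, timeout=40)
        encoding = r.encoding or "utf-8"  # assumption: fall back when nothing is reported
        for chunk in r.iter_content(chunk_size=chunk_size):
            yield chunk.decode(encoding, errors="ignore")
        r.close()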

View File

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")
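
For context, gc.set_debug(gc.DEBUG_LEAK) is leak-hunting instrumentation: it makes the collector keep every unreachable object it finds in gc.garbage and report information about them, which costs memory and produces noise, so dropping it from the crawler is reasonable. A small standalone sketch of how that flag is normally used:

    import gc

    gc.set_debug(gc.DEBUG_LEAK)  # save unreachable objects in gc.garbage and report them

    # ... run the code suspected of leaking ...

    gc.collect()
    print(len(gc.garbage), "objects kept for inspection")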

View File

@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):
     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)
                     stripped_url = url[len(self.base_url) - 1:]
                     path, name = os.path.split(stripped_url)
                     date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
                     return File(
                         path=unquote(path).strip("/"),
                         name=unquote(name),
                         size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                         mtime=int(parse_date(date).timestamp()),
                         is_dir=False
                     )
                 except RequestException:
                     self.session.close()
                     retries -= 1
             return None

     def _stream_body(self, url: str):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
-        return None
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
+            return None

     @staticmethod
     def _parse_links(body):
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
         parser = HTMLAnchorParser()
         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)
         for anchor in parser.anchors:
             yield anchor
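
The other pattern introduced in this file is the warnings.catch_warnings() block wrapped around both request loops: it suppresses library warnings (for example those emitted by requests/urllib3) only for the duration of those calls rather than process-wide. A minimal sketch of the pattern, where fetch_head is an illustrative helper and not code from this repository:

    import warnings
    import requests

    def fetch_head(session: requests.Session, url: str):
        # Silence warnings only while this request runs; the previous warning
        # filters are restored automatically when the with-block exits.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return session.head(url, allow_redirects=False, timeout=40)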

View File

@@ -53,9 +53,6 @@ class TaskManager:
     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id

View File

@@ -4,7 +4,7 @@ import json
 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,