Support encodings other than just UTF-8 and remove some debug info

commit 8a73142ff8
parent 7c47b0f00c
Author: Simon
Date:   2018-06-18 13:44:19 -04:00

4 changed files with 38 additions and 39 deletions

@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:
     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")

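For reference, the two deleted lines enabled CPython's leak-hunting mode for the whole process. A minimal sketch of what that flag does (illustration only, not code from this repo):

import gc

# DEBUG_LEAK combines DEBUG_COLLECTABLE, DEBUG_UNCOLLECTABLE and DEBUG_SAVEALL:
# the collector reports every cycle it finds to stderr and keeps the objects
# reachable in gc.garbage. Useful when hunting a leak, far too noisy otherwise.
gc.set_debug(gc.DEBUG_LEAK)

class Node:
    def __init__(self):
        self.ref = self  # deliberate reference cycle

Node()
gc.collect()       # prints the collectable cycle to stderr
print(gc.garbage)  # DEBUG_SAVEALL kept the Node instance here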
@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
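The new warnings import backs the suppression blocks added below. warnings.catch_warnings() saves the filter state on entry and restores it on exit, so the simplefilter("ignore") silences warnings only inside the with block (presumably noisy urllib3/requests warnings raised during crawls, though the commit does not say). A quick self-contained illustration:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    warnings.warn("swallowed inside the block")  # never reaches stderr

warnings.warn("visible again")  # previous filters are restored here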
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):
     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)
 
-                stripped_url = url[len(self.base_url) - 1:]
+                    stripped_url = url[len(self.base_url) - 1:]
 
-                path, name = os.path.split(stripped_url)
-                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
-                return File(
-                    path=unquote(path).strip("/"),
-                    name=unquote(name),
-                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                    mtime=int(parse_date(date).timestamp()),
-                    is_dir=False
-                )
-            except RequestException:
-                self.session.close()
-                retries -= 1
+                    path, name = os.path.split(stripped_url)
+                    date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
+                    return File(
+                        path=unquote(path).strip("/"),
+                        name=unquote(name),
+                        size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
+                        mtime=int(parse_date(date).timestamp()),
+                        is_dir=False
+                    )
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
 
-        return None
+            return None
 
     def _stream_body(self, url: str):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1
 
-        return None
+            return None
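One caveat with the new decode path: requests derives r.encoding from the Content-Type charset, and it may be None when the response carries no usable charset, in which case chunk.decode(r.encoding, ...) raises TypeError. A defensive variant of the loop (an assumption on my part, not what this commit does) would pin a fallback:

# Sketch: fall back to UTF-8 when the server reports no charset.
encoding = r.encoding or "utf-8"
for chunk in r.iter_content(chunk_size=4096):
    yield chunk.decode(encoding, errors="ignore")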
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
     @staticmethod
     def _parse_links(body):
         parser = HTMLAnchorParser()
         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)
         for anchor in parser.anchors:
             yield anchor
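Since _parse_links now receives already-decoded str chunks from _stream_body, the stdlib parser can be fed incrementally; html.parser buffers incomplete markup across feed() calls. A minimal stand-in for HTMLAnchorParser (hypothetical, the real class lives elsewhere in this file) could look like:

from html.parser import HTMLParser

class AnchorCollector(HTMLParser):
    # Hypothetical stand-in: collects href values from <a> tags.
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.anchors.append(value)

parser = AnchorCollector()
for chunk in ('<a href="file', '1.txt">file1</a>'):  # tag split across chunks
    parser.feed(chunk)
print(parser.anchors)  # ['file1.txt']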

@@ -53,9 +53,6 @@ class TaskManager:
 
     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
-
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id

@@ -4,7 +4,7 @@ import json
 
 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,