From 8a73142ff8621972511df836b82c61d017515c00 Mon Sep 17 00:00:00 2001
From: Simon
Date: Mon, 18 Jun 2018 13:44:19 -0400
Subject: [PATCH] Support encodings other than UTF-8 and remove leftover debug code
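
Response bodies are no longer assumed to be UTF-8: _stream_body() now
decodes each chunk with the encoding that requests reports for the
response (r.encoding), and _parse_links() receives the decoded text
instead of raw bytes. The HEAD and GET calls run inside
warnings.catch_warnings() with simplefilter("ignore"), silencing
warnings emitted by the HTTP stack while crawling. The
gc.set_debug(gc.DEBUG_LEAK) leftovers from earlier leak hunting are
removed, and debug_put.py points at a different test directory.

For reference, a minimal sketch of the new decode path outside the
crawler. The fallback is this sketch's addition, not part of the patch:
requests leaves r.encoding as None when the response has no charset and
a non-text content type, and bytes.decode(None) would raise TypeError.

    import requests

    def stream_text(url, chunk_size=4096):
        # Stream the body and decode every chunk with the encoding
        # requests inferred from the Content-Type header.
        r = requests.get(url, stream=True, timeout=40)
        encoding = r.encoding or "utf-8"  # fallback added for the sketch
        try:
            for chunk in r.iter_content(chunk_size=chunk_size):
                # errors="ignore" also drops multibyte sequences that
                # happen to be split across chunk boundaries
                yield chunk.decode(encoding, errors="ignore")
        finally:
            r.close()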
---
 crawl_server/crawler.py      |  2 --
 crawl_server/remote_http.py  | 70 +++++++++++++++++++-----------------
 crawl_server/task_manager.py |  3 ---
 debug_put.py                 |  2 +-
 4 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 9d3177e..4acb78a 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -80,8 +80,6 @@ class RemoteDirectoryCrawler:

     def crawl_directory(self, out_file: str) -> CrawlResult:
-        import gc
-        gc.set_debug(gc.DEBUG_LEAK)
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
             root_listing = directory.list_dir("")

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 99a6345..74e8003 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,4 +1,5 @@
 from urllib.parse import unquote, urljoin
+import warnings
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -118,44 +119,47 @@ class HttpDirectory(RemoteDirectory):


     def _request_file(self, url):
-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.head(url, allow_redirects=False, timeout=40)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.head(url, allow_redirects=False, timeout=40)

-                stripped_url = url[len(self.base_url) - 1:]
+                    stripped_url = url[len(self.base_url) - 1:]

-                path, name = os.path.split(stripped_url)
-                date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
-                return File(
-                    path=unquote(path).strip("/"),
-                    name=unquote(name),
-                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
-                    mtime=int(parse_date(date).timestamp()),
-                    is_dir=False
-                )
-            except RequestException:
-                self.session.close()
-                retries -= 1
+                    path, name = os.path.split(stripped_url)
+                    date = r.headers["Last-Modified"] if "Last-Modified" in r.headers else "1970-01-01"
+                    return File(
+                        path=unquote(path).strip("/"),
+                        name=unquote(name),
+                        size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
+                        mtime=int(parse_date(date).timestamp()),
+                        is_dir=False
+                    )
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        return None
+            return None

     def _stream_body(self, url: str):
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retries = HttpDirectory.MAX_RETRIES
+            while retries > 0:
+                try:
+                    r = self.session.get(url, stream=True, timeout=40)
+                    for chunk in r.iter_content(chunk_size=4096):
+                        yield chunk.decode(r.encoding, errors="ignore")
+                    r.close()
+                    del r
+                    break
+                except RequestException:
+                    self.session.close()
+                    retries -= 1

-        retries = HttpDirectory.MAX_RETRIES
-        while retries > 0:
-            try:
-                r = self.session.get(url, stream=True, timeout=40)
-                for chunk in r.iter_content(chunk_size=4096):
-                    yield chunk
-                r.close()
-                del r
-                break
-            except RequestException:
-                self.session.close()
-                retries -= 1
-
-        return None
+            return None

     @staticmethod
     def _parse_links(body):
@@ -163,7 +167,7 @@ class HttpDirectory(RemoteDirectory):
         parser = HTMLAnchorParser()

         for chunk in body:
-            parser.feed(chunk.decode("utf-8", errors="ignore"))
+            parser.feed(chunk)

         for anchor in parser.anchors:
             yield anchor
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index df98b77..82b4687 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -53,9 +53,6 @@ class TaskManager:

     @staticmethod
     def run_task(task, db_path, current_tasks):
-        # import gc
-        # gc.set_debug(gc.DEBUG_LEAK)
-
         result = TaskResult()
         result.start_time = datetime.utcnow()
         result.website_id = task.website_id
diff --git a/debug_put.py b/debug_put.py
index 1e18546..450f4c9 100644
--- a/debug_put.py
+++ b/debug_put.py
@@ -4,7 +4,7 @@ import json

 payload = json.dumps({
     "website_id": 123,
-    "url": "http://liminaire.fr/TEXTES/",
+    "url": "https://computerarchive.org/files/computer/",
     # "url": "http://localhost:8000/",
     # "url": "http://ubuntu.mirrorservice.org/",
     "priority": 2,