diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 0bea6fd..9d3177e 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -120,7 +120,6 @@ class RemoteDirectoryCrawler:
 
         files_q.put(None)
         file_writer_thread.join()
-
         return CrawlResult(files_written[0], "success")
 
     def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 2547da0..5913ad2 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -1,5 +1,4 @@
-from urllib.parse import urljoin, unquote, quote
-
+from urllib.parse import unquote
 import os
 from html.parser import HTMLParser
 from itertools import repeat
@@ -104,7 +103,7 @@ class HttpDirectory(RemoteDirectory):
 
     def request_files(self, urls_to_request: list) -> list:
 
-        if len(urls_to_request) > 30:
+        if len(urls_to_request) > 3000000:
             # Many urls, use multi-threaded solution
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
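
For context, the `request_files` hunk above changes only the threshold of an existing fan-out pattern: small batches of URLs are fetched sequentially, large batches are spread across a `ThreadPool` via `starmap`. Below is a minimal, self-contained sketch of that pattern; `DirectorySketch`, `THREAD_THRESHOLD`, and the no-op `_request_file` are illustrative stand-ins, not the project's actual `HttpDirectory` implementation.

```python
# Sketch of the thresholded fan-out used by request_files().
from itertools import repeat
from multiprocessing.pool import ThreadPool


class DirectorySketch:

    THREAD_THRESHOLD = 30  # hypothetical cut-off for switching to threads

    def _request_file(self, url: str):
        # Stand-in for the real HTTP fetch; echoes the URL so the sketch
        # runs without any network access.
        return url

    def request_files(self, urls_to_request: list) -> list:
        if len(urls_to_request) > self.THREAD_THRESHOLD:
            # Many urls: fan out over worker threads. starmap with
            # zip(repeat(self), ...) calls the unbound method with self
            # as the first argument, mirroring the call in the diff above.
            with ThreadPool(processes=10) as pool:
                files = pool.starmap(DirectorySketch._request_file,
                                     zip(repeat(self), urls_to_request))
        else:
            # Few urls: sequential requests avoid pool start-up overhead.
            files = [self._request_file(url) for url in urls_to_request]
        return [f for f in files if f is not None]


if __name__ == "__main__":
    urls = ["http://example.com/%d" % i for i in range(100)]
    print(len(DirectorySketch().request_files(urls)))
```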