From 756e331c83f627b6dfaf6a9c2dfba481e2685f74 Mon Sep 17 00:00:00 2001
From: Simon
Date: Tue, 17 Jul 2018 11:03:10 -0400
Subject: [PATCH] Fixed bug in crawler when file count in a directory is greater than 150

---
 crawl_server/remote_http.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index c6a4718..7df8e1f 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -119,7 +119,7 @@ class HttpDirectory(RemoteDirectory):
 
         if self._isdir(anchor):
             directory = File(
-                name=anchor.href, # todo handle external links here
+                name=anchor.href, # todo handle external links here
                 mtime=0,
                 size=0,
                 path=path,
@@ -143,7 +143,9 @@ class HttpDirectory(RemoteDirectory):
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
             pool.close()
-            return (f for f in files if f)
+            for file in files:
+                if file:
+                    yield file
         else:
             # Too few urls to create thread pool
             for url in urls_to_request:
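
Why the fix works (a minimal sketch, not taken from the patch itself): the enclosing method appears to be a generator function, since the `else` branch yields files one at a time. In Python 3.3+, a `return <value>` statement inside a generator does not hand that value to the caller; it only ends iteration (the value lands on StopIteration.value). So for directories large enough to take the thread-pool branch (more than 150 files, per the subject line), the old `return (f for f in files if f)` silently discarded every result. Yielding each file explicitly keeps the whole method a plain generator. The sketch below uses a made-up threshold and list in place of the crawler's URL batch and thread pool:

    def broken(items):
        # Stand-in for the old code path: `return <genexpr>` inside a
        # generator function ends iteration without yielding anything.
        if len(items) > 2:  # stands in for the >150-file thread-pool branch
            return (i for i in items if i)  # discarded by the caller
        for i in items:
            if i:
                yield i  # this yield is what makes the whole function a generator

    def fixed(items):
        # Stand-in for the patched code path: yield each result explicitly.
        if len(items) > 2:
            results = [i for i in items if i]  # stands in for pool.starmap(...)
            for i in results:
                yield i
        else:
            for i in items:
                if i:
                    yield i

    print(list(broken([1, 0, 2, 3])))  # [] -- the large "directory" yields nothing
    print(list(fixed([1, 0, 2, 3])))   # [1, 2, 3]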