Fixed bug in crawler when file count in a directory is greater than 150

Simon 2018-07-17 11:03:10 -04:00
parent cf96d1697d
commit 756e331c83


@@ -119,7 +119,7 @@ class HttpDirectory(RemoteDirectory):
             if self._isdir(anchor):
                 directory = File(
                     name=anchor.href,  # todo handle external links here
                     mtime=0,
                     size=0,
                     path=path,
@@ -143,7 +143,9 @@ class HttpDirectory(RemoteDirectory):
             pool = ThreadPool(processes=10)
             files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
             pool.close()
-            return (f for f in files if f)
+            for file in files:
+                if file:
+                    yield file
         else:
             # Too few urls to create thread pool
             for url in urls_to_request:
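
The fix matters because this method is a generator: once a function body contains a `yield`, a bare `return <expression>` no longer hands that value to the caller, it simply ends iteration and the expression is discarded. So the thread-pool branch (presumably the path taken once a directory exceeds the 150-file threshold from the commit title) silently produced no files. A minimal standalone sketch of the difference, with illustrative names rather than the crawler's actual API:

def broken(items):
    if len(items) > 2:  # stand-in for the "many files, use the thread pool" branch
        # Inside a generator function, `return <expr>` only stops iteration;
        # the generator expression is discarded and the caller sees nothing.
        return (i for i in items if i)
    for i in items:
        if i:
            yield i

def fixed(items):
    if len(items) > 2:
        # Re-yield each result so the enclosing generator actually produces them.
        for i in items:
            if i:
                yield i
        return
    for i in items:
        if i:
            yield i

print(list(broken([1, 2, 3, 0])))  # [] -- the large-directory branch yields nothing
print(list(fixed([1, 2, 3, 0])))   # [1, 2, 3]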