FTP crawler bug fixes

This commit is contained in:
Simon 2018-06-24 16:44:21 -04:00
parent f603f41754
commit ab35ce96cc
2 changed files with 35 additions and 22 deletions

View File

@ -87,7 +87,10 @@ class RemoteDirectoryCrawler:
try: try:
directory = RemoteDirectoryFactory.get_directory(self.url) directory = RemoteDirectoryFactory.get_directory(self.url)
path_id, root_listing = directory.list_dir(urlparse(self.url).path) path_id, root_listing = directory.list_dir(urlparse(self.url).path)
if root_listing:
self.crawled_paths.append(path_id) self.crawled_paths.append(path_id)
else:
return CrawlResult(0, "empty")
directory.close() directory.close()
except TimeoutError: except TimeoutError:
return CrawlResult(0, "timeout") return CrawlResult(0, "timeout")
@ -132,9 +135,10 @@ class RemoteDirectoryCrawler:
while directory: while directory:
try: try:
path = in_q.get(timeout=300) path = in_q.get(timeout=150)
except Empty: except Empty:
directory.close() directory.close()
print("Directory timed out")
break break
if path is None: if path is None:
@ -147,7 +151,7 @@ class RemoteDirectoryCrawler:
for f in listing: for f in listing:
if f.is_dir: if f.is_dir:
in_q.put(urljoin(f.path, f.name, "")) in_q.put(urljoin(f.path, f.name))
else: else:
files_q.put(f) files_q.put(f)
import sys import sys

View File

@ -22,31 +22,31 @@ class FtpDirectory(RemoteDirectory):
host = urlparse(url).netloc host = urlparse(url).netloc
super().__init__(host) super().__init__(host)
self.max_attempts = 2 self.max_attempts = 3
self.ftp = None self.ftp = None
self.stop_when_connected() self.stop_when_connected()
def _connect(self): def _connect(self):
self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory( self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
use_passive_mode=False use_passive_mode=True
)) ))
self.ftp._session.timeout = 40 self.ftp._session.timeout = 1
def stop_when_connected(self): def stop_when_connected(self):
failed_attempts = 0 failed_attempts = 0
while failed_attempts < self.max_attempts: while failed_attempts < self.max_attempts:
try: try:
self._connect() self._connect()
break return True
except ftputil.error.FTPError as e: except ftputil.error.FTPError as e:
if e.errno == 530 or e.errno == 421: if e.errno == 530 or e.errno == 421:
print("Cancel connection - too many connections")
break break
failed_attempts += 1 failed_attempts += 1
print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno)) print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
time.sleep(2 * random.uniform(0.5, 1.5)) time.sleep(2)
return False
def list_dir(self, path): def list_dir(self, path):
if not self.ftp: if not self.ftp:
@ -59,15 +59,16 @@ class FtpDirectory(RemoteDirectory):
file_names = self.ftp.listdir(path) file_names = self.ftp.listdir(path)
for file_name in file_names: for file_name in file_names:
stat = self.try_stat(os.path.join(path, file_name)) file_path = os.path.join(path, file_name)
is_dir = self.ftp.path.isdir(os.path.join(path, file_name)) stat = self.try_stat(file_path)
is_dir = self.ftp.path.isdir(file_path)
results.append(File( results.append(File(
name=file_name, name=os.path.join(file_name, "") if is_dir else file_name,
mtime=stat.st_mtime, mtime=stat.st_mtime,
size=-1 if is_dir else stat.st_size, size=-1 if is_dir else stat.st_size,
is_dir=is_dir, is_dir=is_dir,
path=path path=path.strip("/") if not is_dir else path
)) ))
return path, results return path, results
except ftputil.error.ParserError as e: except ftputil.error.ParserError as e:
@ -77,22 +78,30 @@ class FtpDirectory(RemoteDirectory):
if e.errno in FtpDirectory.CANCEL_LISTING_CODE: if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
break break
failed_attempts += 1 failed_attempts += 1
print(str(e.strerror) + "errno" + str(e.errno)) self.reconnect()
print("Error - reconnecting")
self.stop_when_connected()
except ftputil.error.PermanentError as e: except ftputil.error.PermanentError as e:
if e.errno == 530: if e.errno == 530:
raise TooManyConnectionsError() raise TooManyConnectionsError()
print(str(e.strerror) + "errno" + str(e.errno)) if e.errno is None:
failed_attempts += 1
self.reconnect()
else:
print(str(e.strerror) + " errno:" + str(e.errno))
break break
except Exception as e: except Exception as e:
# TODO remove that debug info failed_attempts += 1
print("ERROR:" + str(e)) self.reconnect()
print(type(e)) print(e)
raise e
return path, [] return path, []
def reconnect(self):
if self.ftp:
self.ftp.close()
time.sleep(8)
self.stop_when_connected()
def try_stat(self, path): def try_stat(self, path):
try: try: