od-database/crawl_server/remote_ftp.py
2018-06-25 14:36:16 -04:00

119 lines
4.0 KiB
Python

#! /usr/bin/env python
from crawl_server import logger
from urllib.parse import urlparse
import os
import time
import ftputil
import ftputil.error
from ftputil.session import session_factory
from crawl_server.crawler import RemoteDirectory, File, TooManyConnectionsError
class FtpDirectory(RemoteDirectory):
SCHEMES = ("ftp", )
CANCEL_LISTING_CODE = (
550, # Forbidden
)
def __init__(self, url):
host = urlparse(url).netloc
super().__init__(host)
self.max_attempts = 3
self.ftp = None
self.stop_when_connected()
def _connect(self):
self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
use_passive_mode=True
))
self.ftp._session.timeout = 30
def stop_when_connected(self):
failed_attempts = 0
while failed_attempts < self.max_attempts:
try:
self._connect()
logger.debug("New FTP connection @ " + self.base_url)
return True
except ftputil.error.FTPError as e:
if e.errno == 530 or e.errno == 421:
break
failed_attempts += 1
print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
time.sleep(2)
return False
def list_dir(self, path):
if not self.ftp:
# No connection - assuming that connection was dropped because too many
raise TooManyConnectionsError()
results = []
failed_attempts = 0
while failed_attempts < self.max_attempts:
try:
file_names = self.ftp.listdir(path)
for file_name in file_names:
file_path = os.path.join(path, file_name)
stat = self.try_stat(file_path)
is_dir = self.ftp.path.isdir(file_path)
results.append(File(
name=os.path.join(file_name, "") if is_dir else file_name,
mtime=stat.st_mtime,
size=-1 if is_dir else stat.st_size,
is_dir=is_dir,
path=path.strip("/") if not is_dir else path
))
return path, results
except ftputil.error.ParserError as e:
logger.error("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
break
except ftputil.error.FTPError as e:
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
break
failed_attempts += 1
self.reconnect()
except ftputil.error.PermanentError as e:
if e.errno == 530:
raise TooManyConnectionsError()
if e.errno is None:
failed_attempts += 1
self.reconnect()
else:
print(str(e.strerror) + " errno:" + str(e.errno))
break
except Exception as e:
failed_attempts += 1
self.reconnect()
logger.error("Exception while processing FTP listing for " + self.base_url + ": " + str(e))
return path, []
def reconnect(self):
if self.ftp:
self.ftp.close()
success = self.stop_when_connected()
logger.debug("Reconnecting to FTP server " + self.base_url + (" (OK)" if success else " (ERR)"))
def try_stat(self, path):
try:
return self.ftp.stat(path)
except ftputil.error.ParserError as e:
# TODO: Try to parse it ourselves?
logger.error("Exception while parsing FTP listing for " + self.base_url + path + " " + e.strerror)
return None
def close(self):
if self.ftp:
self.ftp.close()
self.ftp = None
logger.debug("Closing FtpRemoteDirectory for " + self.base_url)