od-database/crawl_server/remote_ftp.py

#! /usr/bin/env python
import os
import random
import time
from urllib.parse import urlparse

import ftputil
import ftputil.error
import timeout_decorator
from ftputil.session import session_factory

from crawl_server.crawler import RemoteDirectory, File, TooManyConnectionsError

class FtpDirectory(RemoteDirectory):

    SCHEMES = ("ftp", )

    def __init__(self, url):
        host = urlparse(url).netloc
        super().__init__(host)
        self.failed_attempts = 0
        self.max_attempts = 2
        self.ftp = None
        self.stop_when_connected()

    def _connect(self):
        # Log in anonymously; "od-database" is sent as the password, as is
        # conventional for anonymous FTP. Active (non-passive) mode is forced
        # through ftputil's session factory.
        self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database",
                                   session_factory=session_factory(use_passive_mode=False))
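
    # For reference, session_factory(use_passive_mode=False) builds an
    # ftplib.FTP subclass that ftputil instantiates with (host, user,
    # password). A minimal sketch of roughly what it produces (an
    # approximation, not ftputil's exact implementation):
    #
    #   import ftplib
    #
    #   class ActiveModeSession(ftplib.FTP):
    #       def __init__(self, host, user, password):
    #           super().__init__()
    #           self.connect(host, 21)       # default FTP control port
    #           self.login(user, password)
    #           self.set_pasv(False)         # disable passive mode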

    def stop_when_connected(self):
        # Retry the connection with jittered backoff until it succeeds or
        # max_attempts is exhausted. Reply code 530 means the server refused
        # the login (e.g. too many concurrent connections), so retrying
        # immediately is pointless.
        while self.failed_attempts < self.max_attempts:
            try:
                self._connect()
                self.failed_attempts = 0
                break
            except ftputil.error.FTPError as e:
                if e.errno == 530:
                    print("Cancel connection - too many connections")
                    break
                self.failed_attempts += 1
                print("Connection error; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))

    @timeout_decorator.timeout(15, use_signals=False)
    def list_dir(self, path) -> list:
        if not self.ftp:
            # Every connection attempt in stop_when_connected() failed.
            print("Conn closed")
            return []
        results = []
        try:
            self.ftp.chdir(path)
            file_names = self.ftp.listdir(path)
            for file_name in file_names:
                stat = self.ftp.stat(file_name)
                is_dir = self.ftp.path.isdir(os.path.join(path, file_name))
                results.append(File(
                    name=file_name,
                    mtime=stat.st_mtime,  # TODO: check
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path
                ))
        except ftputil.error.FTPError as e:
            if e.errno == 530:
                raise TooManyConnectionsError()
            # Other FTP errors are swallowed; the partial listing is returned.
        return results

    def close(self):
        if self.ftp:
            self.ftp.close()
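

# Example usage (a minimal sketch; in od-database this class is normally
# instantiated by the crawler via RemoteDirectory's SCHEMES registry, and
# "ftp://ftp.example.com" is a placeholder host). Assumes File exposes the
# attributes passed to its constructor above.
if __name__ == "__main__":
    directory = FtpDirectory("ftp://ftp.example.com")
    try:
        for f in directory.list_dir("/"):
            print(f.name, f.size, f.is_dir)
    finally:
        directory.close()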