Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
FTP crawler bug fixes
This commit is contained in:
parent f603f41754
commit ab35ce96cc
@@ -87,7 +87,10 @@ class RemoteDirectoryCrawler:
|
|||||||
try:
|
try:
|
||||||
directory = RemoteDirectoryFactory.get_directory(self.url)
|
directory = RemoteDirectoryFactory.get_directory(self.url)
|
||||||
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
|
path_id, root_listing = directory.list_dir(urlparse(self.url).path)
|
||||||
|
if root_listing:
|
||||||
self.crawled_paths.append(path_id)
|
self.crawled_paths.append(path_id)
|
||||||
|
else:
|
||||||
|
return CrawlResult(0, "empty")
|
||||||
directory.close()
|
directory.close()
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
return CrawlResult(0, "timeout")
|
return CrawlResult(0, "timeout")
|
||||||
@@ -132,9 +135,10 @@ class RemoteDirectoryCrawler:
|
|||||||
|
|
||||||
while directory:
|
while directory:
|
||||||
try:
|
try:
|
||||||
path = in_q.get(timeout=300)
|
path = in_q.get(timeout=150)
|
||||||
except Empty:
|
except Empty:
|
||||||
directory.close()
|
directory.close()
|
||||||
|
print("Directory timed out")
|
||||||
break
|
break
|
||||||
|
|
||||||
if path is None:
|
if path is None:
|
||||||
@@ -147,7 +151,7 @@ class RemoteDirectoryCrawler:
|
|||||||
|
|
||||||
for f in listing:
|
for f in listing:
|
||||||
if f.is_dir:
|
if f.is_dir:
|
||||||
in_q.put(urljoin(f.path, f.name, ""))
|
in_q.put(urljoin(f.path, f.name))
|
||||||
else:
|
else:
|
||||||
files_q.put(f)
|
files_q.put(f)
|
||||||
import sys
|
import sys
|
||||||
|
@@ -22,31 +22,31 @@ class FtpDirectory(RemoteDirectory):
|
|||||||
|
|
||||||
host = urlparse(url).netloc
|
host = urlparse(url).netloc
|
||||||
super().__init__(host)
|
super().__init__(host)
|
||||||
self.max_attempts = 2
|
self.max_attempts = 3
|
||||||
self.ftp = None
|
self.ftp = None
|
||||||
self.stop_when_connected()
|
self.stop_when_connected()
|
||||||
|
|
||||||
def _connect(self):
|
def _connect(self):
|
||||||
self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
|
self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
|
||||||
use_passive_mode=False
|
use_passive_mode=True
|
||||||
))
|
))
|
||||||
self.ftp._session.timeout = 40
|
self.ftp._session.timeout = 1
|
||||||
|
|
||||||
def stop_when_connected(self):
|
def stop_when_connected(self):
|
||||||
failed_attempts = 0
|
failed_attempts = 0
|
||||||
while failed_attempts < self.max_attempts:
|
while failed_attempts < self.max_attempts:
|
||||||
try:
|
try:
|
||||||
self._connect()
|
self._connect()
|
||||||
break
|
return True
|
||||||
except ftputil.error.FTPError as e:
|
except ftputil.error.FTPError as e:
|
||||||
|
|
||||||
if e.errno == 530 or e.errno == 421:
|
if e.errno == 530 or e.errno == 421:
|
||||||
print("Cancel connection - too many connections")
|
|
||||||
break
|
break
|
||||||
|
|
||||||
failed_attempts += 1
|
failed_attempts += 1
|
||||||
print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
|
print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
|
||||||
time.sleep(2 * random.uniform(0.5, 1.5))
|
time.sleep(2)
|
||||||
|
return False
|
||||||
|
|
||||||
def list_dir(self, path):
|
def list_dir(self, path):
|
||||||
if not self.ftp:
|
if not self.ftp:
|
||||||
@@ -59,15 +59,16 @@ class FtpDirectory(RemoteDirectory):
|
|||||||
file_names = self.ftp.listdir(path)
|
file_names = self.ftp.listdir(path)
|
||||||
|
|
||||||
for file_name in file_names:
|
for file_name in file_names:
|
||||||
stat = self.try_stat(os.path.join(path, file_name))
|
file_path = os.path.join(path, file_name)
|
||||||
is_dir = self.ftp.path.isdir(os.path.join(path, file_name))
|
stat = self.try_stat(file_path)
|
||||||
|
is_dir = self.ftp.path.isdir(file_path)
|
||||||
|
|
||||||
results.append(File(
|
results.append(File(
|
||||||
name=file_name,
|
name=os.path.join(file_name, "") if is_dir else file_name,
|
||||||
mtime=stat.st_mtime,
|
mtime=stat.st_mtime,
|
||||||
size=-1 if is_dir else stat.st_size,
|
size=-1 if is_dir else stat.st_size,
|
||||||
is_dir=is_dir,
|
is_dir=is_dir,
|
||||||
path=path
|
path=path.strip("/") if not is_dir else path
|
||||||
))
|
))
|
||||||
return path, results
|
return path, results
|
||||||
except ftputil.error.ParserError as e:
|
except ftputil.error.ParserError as e:
|
||||||
@@ -77,22 +78,30 @@ class FtpDirectory(RemoteDirectory):
|
|||||||
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
|
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
|
||||||
break
|
break
|
||||||
failed_attempts += 1
|
failed_attempts += 1
|
||||||
print(str(e.strerror) + "errno" + str(e.errno))
|
self.reconnect()
|
||||||
print("Error - reconnecting")
|
|
||||||
self.stop_when_connected()
|
|
||||||
except ftputil.error.PermanentError as e:
|
except ftputil.error.PermanentError as e:
|
||||||
if e.errno == 530:
|
if e.errno == 530:
|
||||||
raise TooManyConnectionsError()
|
raise TooManyConnectionsError()
|
||||||
print(str(e.strerror) + "errno" + str(e.errno))
|
if e.errno is None:
|
||||||
|
failed_attempts += 1
|
||||||
|
self.reconnect()
|
||||||
|
else:
|
||||||
|
print(str(e.strerror) + " errno:" + str(e.errno))
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# TODO remove that debug info
|
failed_attempts += 1
|
||||||
print("ERROR:" + str(e))
|
self.reconnect()
|
||||||
print(type(e))
|
print(e)
|
||||||
raise e
|
|
||||||
|
|
||||||
return path, []
|
return path, []
|
||||||
|
|
||||||
|
def reconnect(self):
|
||||||
|
|
||||||
|
if self.ftp:
|
||||||
|
self.ftp.close()
|
||||||
|
time.sleep(8)
|
||||||
|
self.stop_when_connected()
|
||||||
|
|
||||||
def try_stat(self, path):
|
def try_stat(self, path):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user