Should fix some FTP errors

This commit is contained in:
Simon 2018-06-24 13:50:55 -04:00
parent a6d753c6ee
commit 8e937e69c0
6 changed files with 21 additions and 16 deletions

View File

@@ -15,7 +15,7 @@ CAPTCHA_SITE_KEY = ""
CAPTCHA_SECRET_KEY = "" CAPTCHA_SECRET_KEY = ""
FLASK_SECRET = "" FLASK_SECRET = ""
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000) RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
CRAWL_SERVER_HEADERS = {} HEADERS = {}
CRAWL_SERVER_TOKEN = "" CRAWL_SERVER_TOKEN = ""
CRAWL_SERVER_PORT = 5001 CRAWL_SERVER_PORT = 5001
CRAWL_SERVER_PROCESSES = 3 CRAWL_SERVER_PROCESSES = 3

View File

@@ -129,7 +129,6 @@ class RemoteDirectoryCrawler:
def _process_listings(self, url: str, in_q: Queue, files_q: Queue): def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
directory = RemoteDirectoryFactory.get_directory(url) directory = RemoteDirectoryFactory.get_directory(url)
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
while directory: while directory:
try: try:
@@ -145,7 +144,6 @@ class RemoteDirectoryCrawler:
path_id, listing = directory.list_dir(path) path_id, listing = directory.list_dir(path)
if len(listing) > 0 and path_id not in self.crawled_paths: if len(listing) > 0 and path_id not in self.crawled_paths:
self.crawled_paths.append(path_id) self.crawled_paths.append(path_id)
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
for f in listing: for f in listing:
if f.is_dir: if f.is_dir:

View File

@@ -14,11 +14,15 @@ class FtpDirectory(RemoteDirectory):
SCHEMES = ("ftp", ) SCHEMES = ("ftp", )
CANCEL_LISTING_CODE = (
550, # Forbidden
)
def __init__(self, url): def __init__(self, url):
host = urlparse(url).netloc host = urlparse(url).netloc
super().__init__(host) super().__init__(host)
self.max_attempts = 3 self.max_attempts = 2
self.ftp = None self.ftp = None
self.stop_when_connected() self.stop_when_connected()
@@ -69,13 +73,18 @@ class FtpDirectory(RemoteDirectory):
except ftputil.error.ParserError as e: except ftputil.error.ParserError as e:
print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name)) print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
break break
except ftputil.error.FTPOSError as e:
if e.strerror == "timed out":
failed_attempts += 1
continue
except ftputil.error.FTPError as e: except ftputil.error.FTPError as e:
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
break
failed_attempts += 1
print(str(e.strerror) + "errno" + str(e.errno))
print("Error - reconnecting")
self.stop_when_connected()
except ftputil.error.PermanentError as e:
if e.errno == 530: if e.errno == 530:
raise TooManyConnectionsError() raise TooManyConnectionsError()
print(str(e.strerror) + "errno" + str(e.errno))
break
except Exception as e: except Exception as e:
# TODO remove that debug info # TODO remove that debug info
print("ERROR:" + str(e)) print("ERROR:" + str(e))

View File

@@ -88,11 +88,7 @@ class TaskManager:
@staticmethod @staticmethod
def task_complete(result): def task_complete(result):
try:
task_result, db_path, current_tasks = result task_result, db_path, current_tasks = result
except Exception as e:
print("Exception during task " + str(e))
return
print(task_result.status_code) print(task_result.status_code)
print(task_result.file_count) print(task_result.file_count)

View File

@@ -5,6 +5,7 @@ import os
import validators import validators
import re import re
from ftplib import FTP from ftplib import FTP
import config
def truncate_path(path, max_len): def truncate_path(path, max_len):
@@ -162,12 +163,12 @@ def is_od(url):
return False return False
try: try:
if url.startswith("ftp://"): if url.startswith("ftp://") and config.SUBMIT_FTP:
ftp = FTP(urlparse(url).netloc) ftp = FTP(urlparse(url).netloc)
ftp.login() ftp.login()
ftp.close() ftp.close()
return True return True
else: elif config.SUBMIT_HTTP:
r = requests.get(url, timeout=30, allow_redirects=False) r = requests.get(url, timeout=30, allow_redirects=False)
if r.status_code != 200: if r.status_code != 200:
print("No redirects allowed!") print("No redirects allowed!")

View File

@@ -15,3 +15,4 @@ python-dateutil
flask_httpauth flask_httpauth
ujson ujson
urllib3 urllib3
pyOpenSSL