Should fix some FTP errors

This commit is contained in:
Simon 2018-06-24 13:50:55 -04:00
parent a6d753c6ee
commit 8e937e69c0
6 changed files with 21 additions and 16 deletions

View File

@@ -15,7 +15,7 @@ CAPTCHA_SITE_KEY = ""
CAPTCHA_SECRET_KEY = ""
FLASK_SECRET = ""
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
CRAWL_SERVER_HEADERS = {}
HEADERS = {}
CRAWL_SERVER_TOKEN = ""
CRAWL_SERVER_PORT = 5001
CRAWL_SERVER_PROCESSES = 3

View File

@@ -129,7 +129,6 @@ class RemoteDirectoryCrawler:
def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
directory = RemoteDirectoryFactory.get_directory(url)
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
while directory:
try:
@@ -145,7 +144,6 @@ class RemoteDirectoryCrawler:
path_id, listing = directory.list_dir(path)
if len(listing) > 0 and path_id not in self.crawled_paths:
self.crawled_paths.append(path_id)
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
for f in listing:
if f.is_dir:

View File

@@ -14,11 +14,15 @@ class FtpDirectory(RemoteDirectory):
SCHEMES = ("ftp", )
CANCEL_LISTING_CODE = (
550, # Forbidden
)
def __init__(self, url):
host = urlparse(url).netloc
super().__init__(host)
self.max_attempts = 3
self.max_attempts = 2
self.ftp = None
self.stop_when_connected()
@@ -69,13 +73,18 @@ class FtpDirectory(RemoteDirectory):
except ftputil.error.ParserError as e:
print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
break
except ftputil.error.FTPOSError as e:
if e.strerror == "timed out":
failed_attempts += 1
continue
except ftputil.error.FTPError as e:
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
break
failed_attempts += 1
print(str(e.strerror) + "errno" + str(e.errno))
print("Error - reconnecting")
self.stop_when_connected()
except ftputil.error.PermanentError as e:
if e.errno == 530:
raise TooManyConnectionsError()
print(str(e.strerror) + "errno" + str(e.errno))
break
except Exception as e:
# TODO remove that debug info
print("ERROR:" + str(e))

View File

@@ -88,11 +88,7 @@ class TaskManager:
@staticmethod
def task_complete(result):
try:
task_result, db_path, current_tasks = result
except Exception as e:
print("Exception during task " + str(e))
return
task_result, db_path, current_tasks = result
print(task_result.status_code)
print(task_result.file_count)

View File

@@ -5,6 +5,7 @@ import os
import validators
import re
from ftplib import FTP
import config
def truncate_path(path, max_len):
@@ -162,12 +163,12 @@ def is_od(url):
return False
try:
if url.startswith("ftp://"):
if url.startswith("ftp://") and config.SUBMIT_FTP:
ftp = FTP(urlparse(url).netloc)
ftp.login()
ftp.close()
return True
else:
elif config.SUBMIT_HTTP:
r = requests.get(url, timeout=30, allow_redirects=False)
if r.status_code != 200:
print("No redirects allowed!")

View File

@@ -14,4 +14,5 @@ elasticsearch
python-dateutil
flask_httpauth
ujson
urllib3
urllib3
pyOpenSSL