diff --git a/README.md b/README.md
index 1f63eb9..6d3ee56 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ CAPTCHA_SITE_KEY = ""
 CAPTCHA_SECRET_KEY = ""
 FLASK_SECRET = ""
 RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
-CRAWL_SERVER_HEADERS = {}
+HEADERS = {}
 CRAWL_SERVER_TOKEN = ""
 CRAWL_SERVER_PORT = 5001
 CRAWL_SERVER_PROCESSES = 3
diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index e96b5c9..ffb3744 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -129,7 +129,6 @@ class RemoteDirectoryCrawler:
     def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
 
         directory = RemoteDirectoryFactory.get_directory(url)
-        timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
         while directory:
             try:
@@ -145,7 +144,6 @@ class RemoteDirectoryCrawler:
                 path_id, listing = directory.list_dir(path)
                 if len(listing) > 0 and path_id not in self.crawled_paths:
                     self.crawled_paths.append(path_id)
-                    timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
                     for f in listing:
                         if f.is_dir:
diff --git a/crawl_server/remote_ftp.py b/crawl_server/remote_ftp.py
index d5b500a..ecdc436 100644
--- a/crawl_server/remote_ftp.py
+++ b/crawl_server/remote_ftp.py
@@ -14,11 +14,15 @@ class FtpDirectory(RemoteDirectory):
 
     SCHEMES = ("ftp", )
 
+    CANCEL_LISTING_CODE = (
+        550,  # Forbidden
+    )
+
     def __init__(self, url):
 
         host = urlparse(url).netloc
         super().__init__(host)
-        self.max_attempts = 3
+        self.max_attempts = 2
         self.ftp = None
         self.stop_when_connected()
 
@@ -69,13 +73,18 @@ class FtpDirectory(RemoteDirectory):
             except ftputil.error.ParserError as e:
                 print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
                 break
-            except ftputil.error.FTPOSError as e:
-                if e.strerror == "timed out":
-                    failed_attempts += 1
-                    continue
             except ftputil.error.FTPError as e:
+                if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
+                    break
+                failed_attempts += 1
+                print(str(e.strerror) + "errno" + str(e.errno))
+                print("Error - reconnecting")
+                self.stop_when_connected()
+            except ftputil.error.PermanentError as e:
                 if e.errno == 530:
                     raise TooManyConnectionsError()
+                print(str(e.strerror) + "errno" + str(e.errno))
+                break
             except Exception as e:
                 # TODO remove that debug info
                 print("ERROR:" + str(e))
diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index e38b199..2d44d21 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -88,11 +88,7 @@ class TaskManager:
 
     @staticmethod
     def task_complete(result):
-        try:
-            task_result, db_path, current_tasks = result
-        except Exception as e:
-            print("Exception during task " + str(e))
-            return
+        task_result, db_path, current_tasks = result
 
         print(task_result.status_code)
         print(task_result.file_count)
diff --git a/od_util.py b/od_util.py
index 7fad8ac..0001334 100644
--- a/od_util.py
+++ b/od_util.py
@@ -5,6 +5,7 @@ import os
 import validators
 import re
 from ftplib import FTP
+import config
 
 
 def truncate_path(path, max_len):
@@ -162,12 +163,12 @@ def is_od(url):
         return False
 
     try:
-        if url.startswith("ftp://"):
+        if url.startswith("ftp://") and config.SUBMIT_FTP:
             ftp = FTP(urlparse(url).netloc)
             ftp.login()
             ftp.close()
             return True
-        else:
+        elif config.SUBMIT_HTTP:
             r = requests.get(url, timeout=30, allow_redirects=False)
             if r.status_code != 200:
                 print("No redirects allowed!")
diff --git a/requirements.txt b/requirements.txt
index 25e274f..bc52fd2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,5 @@ elasticsearch
 python-dateutil
 flask_httpauth
 ujson
-urllib3
\ No newline at end of file
+urllib3
+pyOpenSSL
\ No newline at end of file