mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00
Should fix some FTP errors
This commit is contained in:
parent
a6d753c6ee
commit
8e937e69c0
@ -15,7 +15,7 @@ CAPTCHA_SITE_KEY = ""
|
|||||||
CAPTCHA_SECRET_KEY = ""
|
CAPTCHA_SECRET_KEY = ""
|
||||||
FLASK_SECRET = ""
|
FLASK_SECRET = ""
|
||||||
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
|
RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
|
||||||
CRAWL_SERVER_HEADERS = {}
|
HEADERS = {}
|
||||||
CRAWL_SERVER_TOKEN = ""
|
CRAWL_SERVER_TOKEN = ""
|
||||||
CRAWL_SERVER_PORT = 5001
|
CRAWL_SERVER_PORT = 5001
|
||||||
CRAWL_SERVER_PROCESSES = 3
|
CRAWL_SERVER_PROCESSES = 3
|
||||||
|
@ -129,7 +129,6 @@ class RemoteDirectoryCrawler:
|
|||||||
def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
|
def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
|
||||||
|
|
||||||
directory = RemoteDirectoryFactory.get_directory(url)
|
directory = RemoteDirectoryFactory.get_directory(url)
|
||||||
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
|
|
||||||
|
|
||||||
while directory:
|
while directory:
|
||||||
try:
|
try:
|
||||||
@ -145,7 +144,6 @@ class RemoteDirectoryCrawler:
|
|||||||
path_id, listing = directory.list_dir(path)
|
path_id, listing = directory.list_dir(path)
|
||||||
if len(listing) > 0 and path_id not in self.crawled_paths:
|
if len(listing) > 0 and path_id not in self.crawled_paths:
|
||||||
self.crawled_paths.append(path_id)
|
self.crawled_paths.append(path_id)
|
||||||
timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
|
|
||||||
|
|
||||||
for f in listing:
|
for f in listing:
|
||||||
if f.is_dir:
|
if f.is_dir:
|
||||||
|
@ -14,11 +14,15 @@ class FtpDirectory(RemoteDirectory):
|
|||||||
|
|
||||||
SCHEMES = ("ftp", )
|
SCHEMES = ("ftp", )
|
||||||
|
|
||||||
|
CANCEL_LISTING_CODE = (
|
||||||
|
550, # Forbidden
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
|
|
||||||
host = urlparse(url).netloc
|
host = urlparse(url).netloc
|
||||||
super().__init__(host)
|
super().__init__(host)
|
||||||
self.max_attempts = 3
|
self.max_attempts = 2
|
||||||
self.ftp = None
|
self.ftp = None
|
||||||
self.stop_when_connected()
|
self.stop_when_connected()
|
||||||
|
|
||||||
@ -69,13 +73,18 @@ class FtpDirectory(RemoteDirectory):
|
|||||||
except ftputil.error.ParserError as e:
|
except ftputil.error.ParserError as e:
|
||||||
print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
|
print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
|
||||||
break
|
break
|
||||||
except ftputil.error.FTPOSError as e:
|
|
||||||
if e.strerror == "timed out":
|
|
||||||
failed_attempts += 1
|
|
||||||
continue
|
|
||||||
except ftputil.error.FTPError as e:
|
except ftputil.error.FTPError as e:
|
||||||
|
if e.errno in FtpDirectory.CANCEL_LISTING_CODE:
|
||||||
|
break
|
||||||
|
failed_attempts += 1
|
||||||
|
print(str(e.strerror) + "errno" + str(e.errno))
|
||||||
|
print("Error - reconnecting")
|
||||||
|
self.stop_when_connected()
|
||||||
|
except ftputil.error.PermanentError as e:
|
||||||
if e.errno == 530:
|
if e.errno == 530:
|
||||||
raise TooManyConnectionsError()
|
raise TooManyConnectionsError()
|
||||||
|
print(str(e.strerror) + "errno" + str(e.errno))
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# TODO remove that debug info
|
# TODO remove that debug info
|
||||||
print("ERROR:" + str(e))
|
print("ERROR:" + str(e))
|
||||||
|
@ -88,11 +88,7 @@ class TaskManager:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def task_complete(result):
|
def task_complete(result):
|
||||||
|
|
||||||
try:
|
task_result, db_path, current_tasks = result
|
||||||
task_result, db_path, current_tasks = result
|
|
||||||
except Exception as e:
|
|
||||||
print("Exception during task " + str(e))
|
|
||||||
return
|
|
||||||
|
|
||||||
print(task_result.status_code)
|
print(task_result.status_code)
|
||||||
print(task_result.file_count)
|
print(task_result.file_count)
|
||||||
|
@ -5,6 +5,7 @@ import os
|
|||||||
import validators
|
import validators
|
||||||
import re
|
import re
|
||||||
from ftplib import FTP
|
from ftplib import FTP
|
||||||
|
import config
|
||||||
|
|
||||||
|
|
||||||
def truncate_path(path, max_len):
|
def truncate_path(path, max_len):
|
||||||
@ -162,12 +163,12 @@ def is_od(url):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if url.startswith("ftp://"):
|
if url.startswith("ftp://") and config.SUBMIT_FTP:
|
||||||
ftp = FTP(urlparse(url).netloc)
|
ftp = FTP(urlparse(url).netloc)
|
||||||
ftp.login()
|
ftp.login()
|
||||||
ftp.close()
|
ftp.close()
|
||||||
return True
|
return True
|
||||||
else:
|
elif config.SUBMIT_HTTP:
|
||||||
r = requests.get(url, timeout=30, allow_redirects=False)
|
r = requests.get(url, timeout=30, allow_redirects=False)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
print("No redirects allowed!")
|
print("No redirects allowed!")
|
||||||
|
@ -15,3 +15,4 @@ python-dateutil
|
|||||||
flask_httpauth
|
flask_httpauth
|
||||||
ujson
|
ujson
|
||||||
urllib3
|
urllib3
|
||||||
|
pyOpenSSL
|
Loading…
x
Reference in New Issue
Block a user