diff --git a/app.py b/app.py
index e66d772..3c423ad 100644
--- a/app.py
+++ b/app.py
@@ -157,8 +157,7 @@ def try_enqueue(url):
return "A parent directory of this url has already been posted", "danger"
if not od_util.is_valid_url(url):
- return "Error: Invalid url. Make sure to include the http(s):// suffix. " \
- "FTP is not supported", "danger"
+ return "Error: Invalid url. Make sure to include the appropriate scheme.", "danger"
if od_util.is_blacklisted(url):
diff --git a/ftp_crawler.py b/ftp_crawler.py
new file mode 100644
index 0000000..1587d94
--- /dev/null
+++ b/ftp_crawler.py
@@ -0,0 +1,135 @@
+#! /usr/bin/env python
+
+from threading import Thread
+from queue import Queue
+import os
+import time
+import ftputil
+import random
+
+
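+# Plain value object describing a single entry in an FTP directory listing.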
+class File:
+
+ def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
+ self.name = name
+ self.size = size
+ self.mtime = mtime
+ self.path = path
+ self.is_dir = is_dir
+
+ def __str__(self):
+ return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
+
+
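+# Thin wrapper around ftputil.FTPHost that logs in anonymously and reconnects whenever the server drops the connection.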
+class FTPConnection(object):
+ def __init__(self, host):
+ self.host = host
+ self.failed_attempts = 0
+ self.max_attempts = 5
+ self.stop_when_connected()
+ self._list_fn = None
+
+ def _connect(self):
+ # attempt an anonymous FTP connection
+        print("CONNECT %s ATTEMPT" % self.host)
+ self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database")
+        print("CONNECT %s SUCCESS" % self.host)
+
+    def stop_when_connected(self):
+        # keep retrying until an anonymous connection is established
+        # TODO: Max retries
+        while True:
+            try:
+                self._connect()
+                break
+            except Exception:
+                print("CONNECT %s FAILED; trying again..." % self.host)
+                time.sleep(5 * random.uniform(0.5, 1.5))
+
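+    # Return a File entry for everything inside the given remote directory.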
+ def list(self, path) -> list:
+ results = []
+ self.ftp.chdir(path)
+ file_names = self.ftp.listdir(path)
+
+ for file_name in file_names:
+ stat = self.ftp.stat(file_name)
+ is_dir = self.ftp.path.isdir(os.path.join(path, file_name))
+
+ results.append(File(
+ name=file_name,
+ mtime=stat.st_mtime,
+ size=-1 if is_dir else stat.st_size,
+ is_dir=is_dir,
+ path=path
+ ))
+
+ return results
+
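+    # List a directory, retrying with a fresh connection after each failure; give up after max_attempts consecutive failures.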
+ def process_path(self, path):
+ while self.failed_attempts < self.max_attempts:
+ try:
+ results = self.list(path)
+ self.failed_attempts = 0
+ return results
+ except Exception as e:
+ print(e)
+ self.failed_attempts += 1
+ self.ftp.close()
+ print("LIST FAILED; reconnecting...")
+ time.sleep(2 * random.uniform(0.5, 1.5))
+ self.stop_when_connected()
+
+ # if I get here, I never succeeded in getting the data
+        print("LIST ABANDONED %s" % path)
+ self.failed_attempts = 0
+ return []
+
+
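+# Worker loop: take directories off the queue, list them, and push any subdirectories back onto the queue.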
+def process_and_queue(host, q: Queue):
+
+ ftp = FTPConnection(host)
+
+ while True:
+ file = q.get()
+
+ if file.is_dir:
+ print(file)
+ listing = ftp.process_path(os.path.join(file.path, file.name))
+ for f in listing:
+ q.put(f)
+ else:
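+            # regular files are not traversed any further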
+ pass
+
+ q.task_done()
+
+
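+# Entry point: list the root of the hardcoded host, then let the worker threads crawl the subdirectories.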
+def do_the_thing():
+
+ host = "80.252.155.68"
+ ftp = FTPConnection(host)
+ root_listing = ftp.process_path("/")
+ ftp.ftp.close()
+
+ q = Queue(maxsize=0)
+ num_threads = 10
+
+ for i in range(num_threads):
+ worker = Thread(target=process_and_queue, args=(host, q,))
+        worker.daemon = True
+ worker.start()
+
+ for file in root_listing:
+ q.put(file)
+
+ q.join()
+
+
+if __name__ == '__main__':
+ do_the_thing()
diff --git a/requirements.txt b/requirements.txt
index beadaac..f6cb662 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ praw
humanfriendly
apscheduler
bcrypt
-twisted
\ No newline at end of file
+ftputil
\ No newline at end of file
diff --git a/scrapy_od_database/handlers.py b/scrapy_od_database/handlers.py
deleted file mode 100644
index c83e6ae..0000000
--- a/scrapy_od_database/handlers.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import json
-from twisted.protocols.ftp import FTPFileListProtocol
-from scrapy.http import Response
-from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
-
-
-# Inspired by https://github.com/laserson/ftptree
-class FtpListingHandler(FTPDownloadHandler):
-
- def gotClient(self, client, request, file_path):
-
- protocol = FTPFileListProtocol()
-
- return client.list(file_path, protocol).addCallbacks(
- callback=self._build_response,
- callbackArgs=(request, protocol),
- errback=self._failed,
- errbackArgs=(request, ))
-
- def _build_response(self, result, request, protocol):
-
- self.result = result
- body = json.dumps(protocol.files).encode()
- return Response(url=request.url, status=200, body=body)
diff --git a/scrapy_od_database/items.py b/scrapy_od_database/items.py
deleted file mode 100644
index e1591ab..0000000
--- a/scrapy_od_database/items.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from scrapy import Item, Field
-
-
-class File(Item):
- path = Field()
- name = Field()
- mime = Field()
- mtime = Field()
- size = Field()
diff --git a/scrapy_od_database/spiders/ftp_links_spider.py b/scrapy_od_database/spiders/ftp_links_spider.py
deleted file mode 100644
index a710dd2..0000000
--- a/scrapy_od_database/spiders/ftp_links_spider.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import json
-import scrapy
-import os
-from scrapy_od_database.items import File
-
-
-class AnonFtpRequest(scrapy.Request):
-
- anon_meta = {
- "ftp_user": "anonymous",
- "ftp_password": "od-database"
- }
-
- def __init__(self, *args, **kwargs):
- super(AnonFtpRequest, self).__init__(*args, **kwargs)
- self.meta.update(self.anon_meta)
-
-
-class FtpLinksSpider(scrapy.Spider):
- """Scrapy spider for ftp directories. Will gather all files recursively"""
-
- name = "ftp_links"
-
- handle_httpstatus_list = [404]
-
- def __index__(self, **kw):
- super(FtpLinksSpider, self).__init__(**kw)
- self.base_url = kw.get("base_url")
-
- def start_requests(self):
- yield AnonFtpRequest(url=self.base_url, callback=self.parse)
-
- def parse(self, response):
- stripped_url = response.url[len(self.base_url) - 1:]
-
- files = json.loads(response.body)
- for file in files:
-
- if file['filetype'] == 'd':
- yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
-
- if file['filetype'] == '-':
- print(file)
- result = File(
- name=file['filename'],
- path=stripped_url.strip("/"),
- size=file['size'],
- mtime=file['date'])
- yield result