From f2d914060bd7fc90652cb346b5624a858588dc3f Mon Sep 17 00:00:00 2001
From: Simon
Date: Sun, 10 Jun 2018 20:08:59 -0400
Subject: [PATCH] Removed unsuitable scrapy spider and implemented custom crawler

---
 app.py                          |   3 +-
 ftp_crawler.py                  | 127 ++++++++++++++++++
 requirements.txt                |   2 +-
 scrapy_od_database/handlers.py  |  24 ----
 scrapy_od_database/items.py     |   9 --
 .../spiders/ftp_links_spider.py |  49 -------
 6 files changed, 129 insertions(+), 85 deletions(-)
 create mode 100644 ftp_crawler.py
 delete mode 100644 scrapy_od_database/handlers.py
 delete mode 100644 scrapy_od_database/items.py
 delete mode 100644 scrapy_od_database/spiders/ftp_links_spider.py

diff --git a/app.py b/app.py
index e66d772..3c423ad 100644
--- a/app.py
+++ b/app.py
@@ -157,8 +157,7 @@ def try_enqueue(url):
         return "A parent directory of this url has already been posted", "danger"
 
     if not od_util.is_valid_url(url):
-        return "Error: Invalid url. Make sure to include the http(s):// suffix. " \
-               "FTP is not supported", "danger"
+        return "Error: Invalid url. Make sure to include the appropriate scheme.", "danger"
 
     if od_util.is_blacklisted(url):
 
diff --git a/ftp_crawler.py b/ftp_crawler.py
new file mode 100644
index 0000000..1587d94
--- /dev/null
+++ b/ftp_crawler.py
@@ -0,0 +1,127 @@
+#! /usr/bin/env python
+
+from threading import Thread
+from queue import Queue
+import os
+import time
+import ftputil
+import random
+
+
+class File:
+
+    def __init__(self, name: str, size: int, mtime: float, path: str, is_dir: bool):
+        self.name = name
+        self.size = size
+        self.mtime = mtime
+        self.path = path
+        self.is_dir = is_dir
+
+    def __str__(self):
+        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
+
+
+class FTPConnection(object):
+    def __init__(self, host):
+        self.host = host
+        self.failed_attempts = 0
+        self.max_attempts = 5
+        self._list_fn = None
+        self.stop_when_connected()
+
+    def _connect(self):
+        # attempt an anonymous FTP connection
+        print("CONNECT %s ATTEMPT" % self.host)
+        self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database")
+        print("CONNECT %s SUCCESS" % self.host)
+
+    def stop_when_connected(self):
+        # keep retrying the connection ad infinitum (TODO: max retries)
+        while True:
+            try:
+                self._connect()
+                return
+            except Exception:
+                print("CONNECT %s FAILED; trying again..." % self.host)
+                time.sleep(5 * random.uniform(0.5, 1.5))
+
+    def list(self, path) -> list:
+        results = []
+        self.ftp.chdir(path)
+        file_names = self.ftp.listdir(path)
+
+        for file_name in file_names:
+            stat = self.ftp.stat(file_name)
+            is_dir = self.ftp.path.isdir(os.path.join(path, file_name))
+
+            results.append(File(
+                name=file_name,
+                mtime=stat.st_mtime,
+                size=-1 if is_dir else stat.st_size,
+                is_dir=is_dir,
+                path=path
+            ))
+
+        return results
+
+    def process_path(self, path):
+        while self.failed_attempts < self.max_attempts:
+            try:
+                results = self.list(path)
+                self.failed_attempts = 0
+                return results
+            except Exception as e:
+                print(e)
+                self.failed_attempts += 1
+                self.ftp.close()
+                print("LIST FAILED; reconnecting...")
+                time.sleep(2 * random.uniform(0.5, 1.5))
+                self.stop_when_connected()
+
+        # if I get here, I never succeeded in getting the data
+        print("LIST ABANDONED %s" % path)
+        self.failed_attempts = 0
+        return []
+
+
+def process_and_queue(host, q: Queue):
+
+    ftp = FTPConnection(host)
+
+    while True:
+        file = q.get()
+
+        if file.is_dir:
+            print(file)
+            listing = ftp.process_path(os.path.join(file.path, file.name))
+            for f in listing:
+                q.put(f)
+        else:
+            pass
+
+        q.task_done()
+
+
+def do_the_thing():
+
+    host = "80.252.155.68"
+    ftp = FTPConnection(host)
+    root_listing = ftp.process_path("/")
+    ftp.ftp.close()
+
+    q = Queue(maxsize=0)
+    num_threads = 10
+
+    for i in range(num_threads):
+        worker = Thread(target=process_and_queue, args=(host, q))
+        worker.daemon = True
+        worker.start()
+
+    for file in root_listing:
+        q.put(file)
+
+    q.join()
+
+
+if __name__ == '__main__':
+    do_the_thing()
diff --git a/requirements.txt b/requirements.txt
index beadaac..f6cb662 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
-twisted
\ No newline at end of file
+ftputil
\ No newline at end of file
diff --git a/scrapy_od_database/handlers.py b/scrapy_od_database/handlers.py
deleted file mode 100644
index c83e6ae..0000000
--- a/scrapy_od_database/handlers.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import json
-from twisted.protocols.ftp import FTPFileListProtocol
-from scrapy.http import Response
-from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
-
-
-# Inspired by https://github.com/laserson/ftptree
-class FtpListingHandler(FTPDownloadHandler):
-
-    def gotClient(self, client, request, file_path):
-
-        protocol = FTPFileListProtocol()
-
-        return client.list(file_path, protocol).addCallbacks(
-            callback=self._build_response,
-            callbackArgs=(request, protocol),
-            errback=self._failed,
-            errbackArgs=(request, ))
-
-    def _build_response(self, result, request, protocol):
-
-        self.result = result
-        body = json.dumps(protocol.files).encode()
-        return Response(url=request.url, status=200, body=body)
diff --git a/scrapy_od_database/items.py b/scrapy_od_database/items.py
deleted file mode 100644
index e1591ab..0000000
--- a/scrapy_od_database/items.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from scrapy import Item, Field
-
-
-class File(Item):
-    path = Field()
-    name = Field()
-    mime = Field()
-    mtime = Field()
-    size = Field()
diff --git a/scrapy_od_database/spiders/ftp_links_spider.py b/scrapy_od_database/spiders/ftp_links_spider.py
deleted file mode 100644
index a710dd2..0000000
--- a/scrapy_od_database/spiders/ftp_links_spider.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import json
-import scrapy
-import os
-from scrapy_od_database.items import File
-
-
-class AnonFtpRequest(scrapy.Request):
-
-    anon_meta = {
-        "ftp_user": "anonymous",
-        "ftp_password": "od-database"
-    }
-
-    def __init__(self, *args, **kwargs):
-        super(AnonFtpRequest, self).__init__(*args, **kwargs)
-        self.meta.update(self.anon_meta)
-
-
-class FtpLinksSpider(scrapy.Spider):
-    """Scrapy spider for ftp directories. Will gather all files recursively"""
-
-    name = "ftp_links"
-
-    handle_httpstatus_list = [404]
-
-    def __index__(self, **kw):
-        super(FtpLinksSpider, self).__init__(**kw)
-        self.base_url = kw.get("base_url")
-
-    def start_requests(self):
-        yield AnonFtpRequest(url=self.base_url, callback=self.parse)
-
-    def parse(self, response):
-        stripped_url = response.url[len(self.base_url) - 1:]
-
-        files = json.loads(response.body)
-        for file in files:
-
-            if file['filetype'] == 'd':
-                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
-
-            if file['filetype'] == '-':
-                print(file)
-                result = File(
-                    name=file['filename'],
-                    path=stripped_url.strip("/"),
-                    size=file['size'],
-                    mtime=file['date'])
-                yield result
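
A minimal sketch of how the new module can be driven from another script,
assuming ftp_crawler.py is on the import path. The crawl() helper, the host
name, and the thread count are illustrative placeholders, not part of this
patch; only FTPConnection and process_and_queue come from the module itself:

    from threading import Thread
    from queue import Queue

    from ftp_crawler import FTPConnection, process_and_queue

    def crawl(host, num_threads=4):
        # List the root directory once, then let daemon worker threads
        # expand every directory placed on the queue; plain files are
        # consumed without further action, as in process_and_queue.
        ftp = FTPConnection(host)
        root_listing = ftp.process_path("/")
        ftp.ftp.close()

        q = Queue()
        for _ in range(num_threads):
            # each worker opens its own FTPConnection to the host
            Thread(target=process_and_queue, args=(host, q), daemon=True).start()

        for f in root_listing:
            q.put(f)
        q.join()  # returns once every queued entry has been processed

    crawl("ftp.example.com")  # placeholder host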