From 0304c98a3152d17bb463c4257d0f1c6aad4b2bc1 Mon Sep 17 00:00:00 2001
From: Simon
Date: Sun, 10 Jun 2018 14:12:55 -0400
Subject: [PATCH] Added basic ftp spider for scrapy

---
 requirements.txt                              |  3 +-
 scrapy_od_database/handlers.py                | 24 +++++++++
 scrapy_od_database/items.py                   |  9 ++++
 scrapy_od_database/settings.py                |  1 +
 .../spiders/ftp_links_spider.py               | 49 +++++++++++++++++++
 scrapy_od_database/spiders/od_links_spider.py |  9 ++--
 task.py                                       |  3 ++
 7 files changed, 94 insertions(+), 4 deletions(-)
 create mode 100644 scrapy_od_database/handlers.py
 create mode 100644 scrapy_od_database/items.py
 create mode 100644 scrapy_od_database/spiders/ftp_links_spider.py

diff --git a/requirements.txt b/requirements.txt
index 618a493..beadaac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ Flask-Caching
 praw
 humanfriendly
 apscheduler
-bcrypt
\ No newline at end of file
+bcrypt
+twisted
\ No newline at end of file
diff --git a/scrapy_od_database/handlers.py b/scrapy_od_database/handlers.py
new file mode 100644
index 0000000..c83e6ae
--- /dev/null
+++ b/scrapy_od_database/handlers.py
@@ -0,0 +1,24 @@
+import json
+from twisted.protocols.ftp import FTPFileListProtocol
+from scrapy.http import Response
+from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
+
+
+# Inspired by https://github.com/laserson/ftptree
+class FtpListingHandler(FTPDownloadHandler):
+
+    def gotClient(self, client, request, file_path):
+
+        protocol = FTPFileListProtocol()
+
+        return client.list(file_path, protocol).addCallbacks(
+            callback=self._build_response,
+            callbackArgs=(request, protocol),
+            errback=self._failed,
+            errbackArgs=(request, ))
+
+    def _build_response(self, result, request, protocol):
+
+        self.result = result
+        body = json.dumps(protocol.files).encode()
+        return Response(url=request.url, status=200, body=body)
diff --git a/scrapy_od_database/items.py b/scrapy_od_database/items.py
new file mode 100644
index 0000000..e1591ab
--- /dev/null
+++ b/scrapy_od_database/items.py
@@ -0,0 +1,9 @@
+from scrapy import Item, Field
+
+
+class File(Item):
+    path = Field()
+    name = Field()
+    mime = Field()
+    mtime = Field()
+    size = Field()
diff --git a/scrapy_od_database/settings.py b/scrapy_od_database/settings.py
index 4ac5477..0e14ea1 100644
--- a/scrapy_od_database/settings.py
+++ b/scrapy_od_database/settings.py
@@ -11,6 +11,7 @@ BOT_NAME = 'scrapy_od_database'
 SPIDER_MODULES = ['scrapy_od_database.spiders']
 NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
+DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}
 LOG_LEVEL = 'ERROR'
 
 FEED_FORMAT = 'json'
diff --git a/scrapy_od_database/spiders/ftp_links_spider.py b/scrapy_od_database/spiders/ftp_links_spider.py
new file mode 100644
index 0000000..a710dd2
--- /dev/null
+++ b/scrapy_od_database/spiders/ftp_links_spider.py
@@ -0,0 +1,49 @@
+import json
+import scrapy
+import os
+from scrapy_od_database.items import File
+
+
+class AnonFtpRequest(scrapy.Request):
+
+    anon_meta = {
+        "ftp_user": "anonymous",
+        "ftp_password": "od-database"
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(AnonFtpRequest, self).__init__(*args, **kwargs)
+        self.meta.update(self.anon_meta)
+
+
+class FtpLinksSpider(scrapy.Spider):
+    """Scrapy spider for ftp directories.
+    Will gather all files recursively"""
+
+    name = "ftp_links"
+
+    handle_httpstatus_list = [404]
+
+    def __init__(self, **kw):
+        super(FtpLinksSpider, self).__init__(**kw)
+        self.base_url = kw.get("base_url")
+
+    def start_requests(self):
+        yield AnonFtpRequest(url=self.base_url, callback=self.parse)
+
+    def parse(self, response):
+        stripped_url = response.url[len(self.base_url) - 1:]
+
+        files = json.loads(response.body)
+        for file in files:
+
+            if file['filetype'] == 'd':
+                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
+
+            if file['filetype'] == '-':
+                print(file)
+                result = File(
+                    name=file['filename'],
+                    path=stripped_url.strip("/"),
+                    size=file['size'],
+                    mtime=file['date'])
+                yield result
diff --git a/scrapy_od_database/spiders/od_links_spider.py b/scrapy_od_database/spiders/od_links_spider.py
index 4771cc9..fe80f19 100644
--- a/scrapy_od_database/spiders/od_links_spider.py
+++ b/scrapy_od_database/spiders/od_links_spider.py
@@ -1,5 +1,5 @@
 import scrapy
-from os import path
+import os
 from urllib.parse import unquote
 
 
@@ -65,9 +65,12 @@ class LinksSpider(scrapy.Spider):
             # Save file information
             stripped_url = response.url[len(self.base_url) - 1:]
             self.crawled_links.add(response.url)
+
+            path, name = os.path.split(stripped_url)
+
             yield {
-                "path": unquote(path.split(stripped_url)[0]).strip("/"),
-                "name": unquote(path.split(stripped_url)[1]),
+                "path": unquote(path).strip("/"),
+                "name": unquote(name),
                 "size": int(response.headers["Content-Length"].decode("utf-8"))
                 if "Content-Length" in response.headers else -1,
                 "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                 if "Content-Type" in response.headers else "?",
diff --git a/task.py b/task.py
index 613e5b8..bb80cd7 100644
--- a/task.py
+++ b/task.py
@@ -56,6 +56,7 @@ class TaskManager:
             os.remove("data.json")
             print("Imported in SQLite3")
 
+            # TODO: Extract 'callbacks' for posts and comments in a function
             if post_id:
                 # Reply to post
                 stats = self.db.get_website_stats(website.id)
@@ -75,6 +76,8 @@ class TaskManager:
                     print(comment)
                     reddit_comment = self.reddit_bot.reddit.comment(comment_id)
                     self.reddit_bot.reply(reddit_comment, comment)
+
         busy.value = 0
         print("Done crawling task")
+
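
Usage sketch (not part of the patch): once the DOWNLOAD_HANDLERS entry above registers FtpListingHandler
for ftp:// URLs, the new spider can be driven programmatically. The snippet below assumes it is run from the
project root so settings.py is picked up; the FTP URL and the helper file name are placeholders.

    # run_ftp_crawl.py -- hypothetical helper for trying out the new spider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from scrapy_od_database.spiders.ftp_links_spider import FtpLinksSpider

    # get_project_settings() loads settings.py, which maps ftp:// to FtpListingHandler
    process = CrawlerProcess(get_project_settings())
    # base_url is forwarded to the spider; AnonFtpRequest supplies anonymous credentials
    process.crawl(FtpLinksSpider, base_url="ftp://example.com/")
    process.start()

The same crawl should also start from the command line with:

    scrapy crawl ftp_links -a base_url=ftp://example.com/ -o files.json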