Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
Added basic FTP spider for Scrapy
This commit is contained in:
parent f1e8183cdf
commit 0304c98a31
@@ -9,3 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
+twisted
scrapy_od_database/handlers.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that returns an FTP directory listing as a JSON response body."""

    def gotClient(self, client, request, file_path):
        # Ask the connected FTP client for a structured listing of file_path
        protocol = FTPFileListProtocol()

        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request, ))

    def _build_response(self, result, request, protocol):
        self.result = result
        # protocol.files holds one dict per entry in the listing
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
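
Note (illustrative, not part of the commit): FTPFileListProtocol parses each line of the FTP LIST reply into a dict, so the body built above decodes to a JSON array of such entries. A minimal sketch, assuming the key names used by Twisted's listing parser; the values are invented:

# Sketch only: roughly what json.loads(response.body) yields for the spider below.
# Keys are assumed to match Twisted's FTPFileListProtocol output; values are made up.
import json

sample_body = json.dumps([{
    "filetype": "-",           # '-' regular file, 'd' directory
    "perms": "rw-r--r--",
    "nlinks": 1,
    "owner": "ftp",
    "group": "ftp",
    "size": 1048576,
    "date": "Jan 01 2018",
    "filename": "example.iso",
    "linktarget": None,
}]).encode()

files = json.loads(sample_body)
assert files[0]["filetype"] == "-" and files[0]["filename"] == "example.iso"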
scrapy_od_database/items.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()
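
A quick sketch of how this item behaves (not in the commit; all values are placeholders). Scrapy items act like dicts restricted to their declared fields:

# Illustration only: constructing and reading the File item declared above.
from scrapy_od_database.items import File

f = File(name="example.iso", path="pub/isos", size=1048576, mtime="Jan 01 2018")
f["mime"] = "application/octet-stream"  # fields can also be filled in later
print(dict(f))                          # plain dict view of the item
# f["owner"] = "ftp"                    # would raise KeyError: 'owner' is not a declared Field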
@@ -11,6 +11,7 @@ BOT_NAME = 'scrapy_od_database'

 SPIDER_MODULES = ['scrapy_od_database.spiders']
 NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
+DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}

 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
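
The commit registers the handler project-wide; for reference, the same wiring could also sit on a single spider through Scrapy's custom_settings attribute. A sketch only, not used in this commit; the spider name is hypothetical:

# Sketch: per-spider registration of the FTP listing handler.
import scrapy


class ExampleFtpSpider(scrapy.Spider):
    name = "example_ftp"  # hypothetical spider, shown only to illustrate custom_settings
    custom_settings = {
        "DOWNLOAD_HANDLERS": {"ftp": "scrapy_od_database.handlers.FtpListingHandler"},
    }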
scrapy_od_database/spiders/ftp_links_spider.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import json
import scrapy
import os
from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):
    """Request that always carries anonymous FTP credentials in its meta."""

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for ftp directories. Will gather all files recursively"""

    name = "ftp_links"

    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        # Path of the current directory relative to the FTP root
        stripped_url = response.url[len(self.base_url) - 1:]

        # FtpListingHandler returns the directory listing as a JSON body
        files = json.loads(response.body)
        for file in files:

            if file['filetype'] == 'd':
                # Directory: recurse into it
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))

            if file['filetype'] == '-':
                # Regular file: emit a File item
                print(file)
                result = File(
                    name=file['filename'],
                    path=stripped_url.strip("/"),
                    size=file['size'],
                    mtime=file['date'])
                yield result
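
One way to run the new spider programmatically (a sketch under the assumption that the project settings above are on the path; "ftp://example.com/" is a placeholder URL):

# Sketch: run FtpLinksSpider outside of `scrapy crawl`.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_od_database.spiders.ftp_links_spider import FtpLinksSpider

process = CrawlerProcess(get_project_settings())
process.crawl(FtpLinksSpider, base_url="ftp://example.com/")  # kwargs reach __init__
process.start()  # blocks until the crawl finishes; items go to the configured JSON feed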
@@ -1,5 +1,5 @@
 import scrapy
-from os import path
+import os
 from urllib.parse import unquote


@@ -65,9 +65,12 @@ class LinksSpider(scrapy.Spider):
         # Save file information
         stripped_url = response.url[len(self.base_url) - 1:]
         self.crawled_links.add(response.url)
+
+        path, name = os.path.split(stripped_url)
+
         yield {
-            "path": unquote(path.split(stripped_url)[0]).strip("/"),
-            "name": unquote(path.split(stripped_url)[1]),
+            "path": unquote(path).strip("/"),
+            "name": unquote(name),
             "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
             "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
             if "Content-Type" in response.headers else "?",
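
The switch to os.path.split removes the double split done by the old code in LinksSpider.parse. A quick illustration with a made-up value of stripped_url:

# Illustration only: how the new path/name split behaves on a sample URL suffix.
import os
from urllib.parse import unquote

stripped_url = "pub/linux/ubuntu-18.04.iso"  # hypothetical stripped_url
path, name = os.path.split(stripped_url)
print(unquote(path).strip("/"))  # -> "pub/linux"
print(unquote(name))             # -> "ubuntu-18.04.iso"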
task.py (3 added lines)
@@ -56,6 +56,7 @@ class TaskManager:
         os.remove("data.json")
         print("Imported in SQLite3")

+        # TODO: Extract 'callbacks' for posts and comments in a function
         if post_id:
             # Reply to post
             stats = self.db.get_website_stats(website.id)
@@ -75,6 +76,8 @@ class TaskManager:
             print(comment)
             reddit_comment = self.reddit_bot.reddit.comment(comment_id)
             self.reddit_bot.reply(reddit_comment, comment)

         busy.value = 0
         print("Done crawling task")