Removed unsuitable scrapy spider and implemented custom crawler

Simon
2018-06-10 20:08:59 -04:00
parent d8c16d53e6
commit f2d914060b
6 changed files with 129 additions and 85 deletions


@@ -1,24 +0,0 @@
import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that returns an FTP directory listing instead of file contents."""

    def gotClient(self, client, request, file_path):
        protocol = FTPFileListProtocol()
        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        # Serialise the parsed listing so the spider can read it from response.body
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
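
For context only (not part of this diff): a custom download handler like the one above is normally registered through Scrapy's DOWNLOAD_HANDLERS setting. The module path below is an assumption for illustration; the project's actual path may differ.

# settings.py -- illustrative module path, assumed for this example
DOWNLOAD_HANDLERS = {
    "ftp": "scrapy_od_database.handlers.FtpListingHandler",
}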


@@ -1,9 +0,0 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()


@@ -1,49 +0,0 @@
import json
import os

import scrapy

from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):
    """Request that always logs in to the FTP server with anonymous credentials."""

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for FTP directories. Will gather all files recursively."""

    name = "ftp_links"
    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        # Path of the current directory relative to the crawl root
        stripped_url = response.url[len(self.base_url) - 1:]

        files = json.loads(response.body)
        for file in files:
            if file["filetype"] == "d":
                # Directory: queue a listing request for it
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
            if file["filetype"] == "-":
                # Regular file: emit an item with its metadata
                print(file)
                result = File(
                    name=file["filename"],
                    path=stripped_url.strip("/"),
                    size=file["size"],
                    mtime=file["date"])
                yield result
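
The custom crawler that replaces this spider is not shown in the excerpt above. For orientation only, below is a minimal sketch of a recursive FTP lister built on Python's standard ftplib; the host name, function names, and overall structure are assumptions for illustration, not the code this commit adds.

from ftplib import FTP, error_perm


def list_recursive(ftp, path="/"):
    """Yield (directory, name, size) tuples for every file under `path`."""
    ftp.cwd(path)
    try:
        # MLSD returns structured facts (type, size, modify) when the server supports it
        entries = list(ftp.mlsd())
    except error_perm:
        return
    for name, facts in entries:
        if name in (".", ".."):
            continue
        if facts.get("type") == "dir":
            yield from list_recursive(ftp, path.rstrip("/") + "/" + name)
            ftp.cwd(path)  # restore the working directory after recursing
        elif facts.get("type") == "file":
            yield path, name, int(facts.get("size", 0))


if __name__ == "__main__":
    ftp = FTP("ftp.example.com")           # hypothetical host
    ftp.login("anonymous", "od-database")  # same anonymous credentials as the spider above
    for directory, name, size in list_recursive(ftp):
        print(directory, name, size)
    ftp.quit()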