Removed unsuitable scrapy spider and implemented custom crawler

Simon
2018-06-10 20:08:59 -04:00
parent d8c16d53e6
commit f2d914060b
6 changed files with 129 additions and 85 deletions


@@ -1,24 +0,0 @@
import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that returns an FTP directory listing instead of file contents."""

    def gotClient(self, client, request, file_path):
        protocol = FTPFileListProtocol()
        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        # Serialise the parsed listing so the spider can read it from response.body
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
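
For context only (not part of this diff): a custom download handler like the one above is normally registered through Scrapy's DOWNLOAD_HANDLERS setting. The module path below is an assumption for illustration; the project's actual path may differ.

# settings.py -- illustrative module path, assumed for this example
DOWNLOAD_HANDLERS = {
    "ftp": "scrapy_od_database.handlers.FtpListingHandler",
}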


@@ -1,9 +0,0 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()


@@ -1,49 +0,0 @@
import json
import os

import scrapy

from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):
    """Request that always logs in to the FTP server with anonymous credentials."""

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for FTP directories. Will gather all files recursively."""

    name = "ftp_links"
    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        # Path of the current directory relative to the crawl root
        stripped_url = response.url[len(self.base_url) - 1:]

        files = json.loads(response.body)
        for file in files:
            if file["filetype"] == "d":
                # Directory: queue a listing request for it
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
            if file["filetype"] == "-":
                # Regular file: emit an item with its metadata
                print(file)
                result = File(
                    name=file["filename"],
                    path=stripped_url.strip("/"),
                    size=file["size"],
                    mtime=file["date"])
                yield result
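
The custom crawler that replaces this spider is not shown in the excerpt above. For orientation only, below is a minimal sketch of a recursive FTP lister built on Python's standard ftplib; the host name, function names, and overall structure are assumptions for illustration, not the code this commit adds.

from ftplib import FTP, error_perm


def list_recursive(ftp, path="/"):
    """Yield (directory, name, size) tuples for every file under `path`."""
    ftp.cwd(path)
    try:
        # MLSD returns structured facts (type, size, modify) when the server supports it
        entries = list(ftp.mlsd())
    except error_perm:
        return
    for name, facts in entries:
        if name in (".", ".."):
            continue
        if facts.get("type") == "dir":
            yield from list_recursive(ftp, path.rstrip("/") + "/" + name)
            ftp.cwd(path)  # restore the working directory after recursing
        elif facts.get("type") == "file":
            yield path, name, int(facts.get("size", 0))


if __name__ == "__main__":
    ftp = FTP("ftp.example.com")           # hypothetical host
    ftp.login("anonymous", "od-database")  # same anonymous credentials as the spider above
    for directory, name, size in list_recursive(ftp):
        print(directory, name, size)
    ftp.quit()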