Added basic ftp spider for scrapy

Simon 2018-06-10 14:12:55 -04:00
parent f1e8183cdf
commit 0304c98a31
7 changed files with 94 additions and 4 deletions

View File

@@ -9,3 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
+twisted

View File

@@ -0,0 +1,24 @@
import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that returns FTP directory listings as JSON."""

    def gotClient(self, client, request, file_path):
        # Issue a LIST command instead of a file download;
        # FTPFileListProtocol accumulates the parsed directory entries.
        protocol = FTPFileListProtocol()
        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        # Serialize the parsed listing so the spider can json.loads() the body.
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
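
For reference: twisted's FTPFileListProtocol parses each LIST line into a dict and collects them in its files attribute, so the JSON body built above looks roughly like the following (values are illustrative, not from a real listing):

[
    {"filetype": "d", "perms": "rwxr-xr-x", "nlinks": 2, "owner": "ftp",
     "group": "ftp", "size": 4096, "date": "Jun 10 14:12",
     "filename": "pub", "linktarget": None},
    {"filetype": "-", "perms": "rw-r--r--", "nlinks": 1, "owner": "ftp",
     "group": "ftp", "size": 1048576, "date": "Jun 10 14:12",
     "filename": "debian.iso", "linktarget": None}
]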

View File

@@ -0,0 +1,9 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()
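
Scrapy Items behave like dicts with a fixed set of allowed keys; a quick sketch of how the spider below fills one (values are made up):

item = File(name="debian.iso", path="pub/iso", size=1048576, mtime="Jun 10 14:12")
item["mime"] = "application/x-iso9660-image"  # fields can also be set dict-style
print(item["name"])  # -> debian.iso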

View File

@@ -11,6 +11,7 @@ BOT_NAME = 'scrapy_od_database'
 SPIDER_MODULES = ['scrapy_od_database.spiders']
 NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
+DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}
 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
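
With the handler registered for the ftp scheme, the spider can be run like any other; base_url is passed as a spider argument (the output file name here is just an example):

scrapy crawl ftp_links -a base_url=ftp://example.com/ -o files.json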

View File

@@ -0,0 +1,49 @@
import json
import os

import scrapy

from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):
    """Request pre-configured with anonymous FTP credentials."""

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for FTP directories; gathers all files recursively."""

    name = "ftp_links"
    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        # Path of the current directory relative to the FTP root.
        stripped_url = response.url[len(self.base_url) - 1:]
        files = json.loads(response.body)

        for file in files:
            if file['filetype'] == 'd':
                # Directory: recurse into it.
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))

            if file['filetype'] == '-':
                # Regular file: emit an item.
                yield File(
                    name=file['filename'],
                    path=stripped_url.strip("/"),
                    size=file['size'],
                    mtime=file['date'])
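
A minimal sketch of driving this spider from Python instead of the scrapy CLI; the spider's module path and the inline settings are assumptions, not part of this commit:

from scrapy.crawler import CrawlerProcess

# Assumed import path for the spider added above.
from scrapy_od_database.spiders.ftp_spider import FtpLinksSpider

process = CrawlerProcess(settings={
    "DOWNLOAD_HANDLERS": {"ftp": "scrapy_od_database.handlers.FtpListingHandler"},
    "FEED_FORMAT": "json",
    "FEED_URI": "files.json",
    "LOG_LEVEL": "ERROR",
})
process.crawl(FtpLinksSpider, base_url="ftp://example.com/")
process.start()  # blocks until the crawl finishes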

View File

@@ -1,5 +1,5 @@
 import scrapy
-from os import path
+import os
 from urllib.parse import unquote
@@ -65,9 +65,12 @@ class LinksSpider(scrapy.Spider):
         # Save file information
         stripped_url = response.url[len(self.base_url) - 1:]
         self.crawled_links.add(response.url)
+        path, name = os.path.split(stripped_url)
         yield {
-            "path": unquote(path.split(stripped_url)[0]).strip("/"),
-            "name": unquote(path.split(stripped_url)[1]),
+            "path": unquote(path).strip("/"),
+            "name": unquote(name),
             "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
             "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
             if "Content-Type" in response.headers else "?",

View File

@@ -56,6 +56,7 @@ class TaskManager:
         os.remove("data.json")
         print("Imported in SQLite3")
+        # TODO: Extract 'callbacks' for posts and comments into a function
         if post_id:
             # Reply to post
             stats = self.db.get_website_stats(website.id)
@@ -75,6 +76,8 @@
         print(comment)
         reddit_comment = self.reddit_bot.reddit.comment(comment_id)
         self.reddit_bot.reply(reddit_comment, comment)
+        busy.value = 0
+        print("Done crawling task")