From 0304c98a3152d17bb463c4257d0f1c6aad4b2bc1 Mon Sep 17 00:00:00 2001
From: Simon
Date: Sun, 10 Jun 2018 14:12:55 -0400
Subject: [PATCH] Added basic ftp spider for scrapy

---
 requirements.txt                              |  3 +-
 scrapy_od_database/handlers.py                | 24 +++++++++
 scrapy_od_database/items.py                   |  9 ++++
 scrapy_od_database/settings.py                |  1 +
 .../spiders/ftp_links_spider.py               | 49 +++++++++++++++++++
 scrapy_od_database/spiders/od_links_spider.py |  9 ++--
 task.py                                       |  3 ++
 7 files changed, 94 insertions(+), 4 deletions(-)
 create mode 100644 scrapy_od_database/handlers.py
 create mode 100644 scrapy_od_database/items.py
 create mode 100644 scrapy_od_database/spiders/ftp_links_spider.py

diff --git a/requirements.txt b/requirements.txt
index 618a493..beadaac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ Flask-Caching
 praw
 humanfriendly
 apscheduler
-bcrypt
\ No newline at end of file
+bcrypt
+twisted
\ No newline at end of file
diff --git a/scrapy_od_database/handlers.py b/scrapy_od_database/handlers.py
new file mode 100644
index 0000000..c83e6ae
--- /dev/null
+++ b/scrapy_od_database/handlers.py
@@ -0,0 +1,24 @@
+import json
+from twisted.protocols.ftp import FTPFileListProtocol
+from scrapy.http import Response
+from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
+
+
+# Inspired by https://github.com/laserson/ftptree
+class FtpListingHandler(FTPDownloadHandler):
+
+    def gotClient(self, client, request, file_path):
+
+        protocol = FTPFileListProtocol()
+
+        return client.list(file_path, protocol).addCallbacks(
+            callback=self._build_response,
+            callbackArgs=(request, protocol),
+            errback=self._failed,
+            errbackArgs=(request, ))
+
+    def _build_response(self, result, request, protocol):
+
+        self.result = result
+        body = json.dumps(protocol.files).encode()
+        return Response(url=request.url, status=200, body=body)
diff --git a/scrapy_od_database/items.py b/scrapy_od_database/items.py
new file mode 100644
index 0000000..e1591ab
--- /dev/null
+++ b/scrapy_od_database/items.py
@@ -0,0 +1,9 @@
+from scrapy import Item, Field
+
+
+class File(Item):
+    path = Field()
+    name = Field()
+    mime = Field()
+    mtime = Field()
+    size = Field()
diff --git a/scrapy_od_database/settings.py b/scrapy_od_database/settings.py
index 4ac5477..0e14ea1 100644
--- a/scrapy_od_database/settings.py
+++ b/scrapy_od_database/settings.py
@@ -11,6 +11,7 @@ BOT_NAME = 'scrapy_od_database'
 SPIDER_MODULES = ['scrapy_od_database.spiders']
 NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
 
+DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}
 LOG_LEVEL = 'ERROR'
 
 FEED_FORMAT = 'json'
diff --git a/scrapy_od_database/spiders/ftp_links_spider.py b/scrapy_od_database/spiders/ftp_links_spider.py
new file mode 100644
index 0000000..a710dd2
--- /dev/null
+++ b/scrapy_od_database/spiders/ftp_links_spider.py
@@ -0,0 +1,49 @@
+import json
+import scrapy
+import os
+from scrapy_od_database.items import File
+
+
+class AnonFtpRequest(scrapy.Request):
+
+    anon_meta = {
+        "ftp_user": "anonymous",
+        "ftp_password": "od-database"
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(AnonFtpRequest, self).__init__(*args, **kwargs)
+        self.meta.update(self.anon_meta)
+
+
+class FtpLinksSpider(scrapy.Spider):
+    """Scrapy spider for ftp directories.
+    Will gather all files recursively"""
+
+    name = "ftp_links"
+
+    handle_httpstatus_list = [404]
+
+    def __init__(self, **kw):
+        super(FtpLinksSpider, self).__init__(**kw)
+        self.base_url = kw.get("base_url")
+
+    def start_requests(self):
+        yield AnonFtpRequest(url=self.base_url, callback=self.parse)
+
+    def parse(self, response):
+        stripped_url = response.url[len(self.base_url) - 1:]
+
+        files = json.loads(response.body)
+        for file in files:
+
+            if file['filetype'] == 'd':
+                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
+
+            if file['filetype'] == '-':
+                print(file)
+                result = File(
+                    name=file['filename'],
+                    path=stripped_url.strip("/"),
+                    size=file['size'],
+                    mtime=file['date'])
+                yield result
diff --git a/scrapy_od_database/spiders/od_links_spider.py b/scrapy_od_database/spiders/od_links_spider.py
index 4771cc9..fe80f19 100644
--- a/scrapy_od_database/spiders/od_links_spider.py
+++ b/scrapy_od_database/spiders/od_links_spider.py
@@ -1,5 +1,5 @@
 import scrapy
-from os import path
+import os
 from urllib.parse import unquote
 
 
@@ -65,9 +65,12 @@ class LinksSpider(scrapy.Spider):
             # Save file information
             stripped_url = response.url[len(self.base_url) - 1:]
             self.crawled_links.add(response.url)
+
+            path, name = os.path.split(stripped_url)
+
             yield {
-                "path": unquote(path.split(stripped_url)[0]).strip("/"),
-                "name": unquote(path.split(stripped_url)[1]),
+                "path": unquote(path).strip("/"),
+                "name": unquote(name),
                 "size": int(response.headers["Content-Length"].decode("utf-8"))
                 if "Content-Length" in response.headers else -1,
                 "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                 if "Content-Type" in response.headers else "?",
diff --git a/task.py b/task.py
index 613e5b8..bb80cd7 100644
--- a/task.py
+++ b/task.py
@@ -56,6 +56,7 @@ class TaskManager:
             os.remove("data.json")
             print("Imported in SQLite3")
 
+            # TODO: Extract 'callbacks' for posts and comments in a function
             if post_id:
                 # Reply to post
                 stats = self.db.get_website_stats(website.id)
@@ -75,6 +76,8 @@ class TaskManager:
                     print(comment)
                     reddit_comment = self.reddit_bot.reddit.comment(comment_id)
                     self.reddit_bot.reply(reddit_comment, comment)
+
         busy.value = 0
         print("Done crawling task")
+
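
Usage sketch (not part of the patch): once the DOWNLOAD_HANDLERS entry above registers FtpListingHandler
for ftp:// URLs, the new spider can be driven programmatically. The snippet below assumes it is run from the
project root so settings.py is picked up; the FTP URL and the helper file name are placeholders.

    # run_ftp_crawl.py -- hypothetical helper for trying out the new spider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from scrapy_od_database.spiders.ftp_links_spider import FtpLinksSpider

    # get_project_settings() loads settings.py, which maps ftp:// to FtpListingHandler
    process = CrawlerProcess(get_project_settings())
    # base_url is forwarded to the spider; AnonFtpRequest supplies anonymous credentials
    process.crawl(FtpLinksSpider, base_url="ftp://example.com/")
    process.start()

The same crawl should also start from the command line with:

    scrapy crawl ftp_links -a base_url=ftp://example.com/ -o files.json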