Removed unsuitable scrapy spider and implemented custom crawler

Simon 2018-06-10 20:08:59 -04:00
parent d8c16d53e6
commit f2d914060b
6 changed files with 129 additions and 85 deletions

3
app.py

@@ -157,8 +157,7 @@ def try_enqueue(url):
         return "A parent directory of this url has already been posted", "danger"
     if not od_util.is_valid_url(url):
-        return "<strong>Error:</strong> Invalid url. Make sure to include the http(s):// suffix. " \
-               "FTP is not supported", "danger"
+        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "danger"
     if od_util.is_blacklisted(url):
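
The reworded message reflects that FTP URLs are now accepted rather than rejected outright. A scheme-aware check along these lines would match the new wording (hypothetical sketch; the real od_util.is_valid_url is not part of this diff):

import urllib.parse

def is_valid_url(url):
    # Hypothetical: accept the schemes the crawlers can handle, require a host
    parsed = urllib.parse.urlparse(url)
    return parsed.scheme in ("http", "https", "ftp") and bool(parsed.netloc)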

127
ftp_crawler.py Normal file

@@ -0,0 +1,127 @@
#! /usr/bin/env python
from threading import Thread
from queue import Queue
import os
import time
import random

import ftputil


class File:
    """A single file or directory entry returned by a listing."""

    def __init__(self, name: str, size: int, mtime: float, path: str, is_dir: bool):
        self.name = name
        self.size = size
        self.mtime = mtime
        self.path = path
        self.is_dir = is_dir

    def __str__(self):
        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name


class FTPConnection(object):

    def __init__(self, host):
        self.host = host
        self.failed_attempts = 0
        self.max_attempts = 5
        self.stop_when_connected()

    def _connect(self):
        # Attempt an anonymous FTP connection
        print("CONNECT %s ATTEMPT" % self.host)
        self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database")
        print("CONNECT %s SUCCESS" % self.host)

    def stop_when_connected(self):
        # Continually tries to reconnect, ad infinitum
        # TODO: Max retries
        try:
            self._connect()
        except Exception:
            print("CONNECT %s FAILED; trying again..." % self.host)
            time.sleep(5 * random.uniform(0.5, 1.5))
            self.stop_when_connected()

    def list(self, path) -> list:
        results = []
        self.ftp.chdir(path)
        file_names = self.ftp.listdir(path)
        for file_name in file_names:
            stat = self.ftp.stat(file_name)
            is_dir = self.ftp.path.isdir(os.path.join(path, file_name))
            results.append(File(
                name=file_name,
                mtime=stat.st_mtime,
                size=-1 if is_dir else stat.st_size,
                is_dir=is_dir,
                path=path
            ))
        return results

    def process_path(self, path):
        while self.failed_attempts < self.max_attempts:
            try:
                results = self.list(path)
                self.failed_attempts = 0
                return results
            except Exception as e:
                print(e)
                self.failed_attempts += 1
                self.ftp.close()
                print("LIST FAILED; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))
                self.stop_when_connected()

        # If I get here, I never succeeded in getting the data
        print("LIST ABANDONED %s" % path)
        self.failed_attempts = 0
        return []


def process_and_queue(host, q: Queue):
    # Each worker owns its own FTP session and pulls paths off the shared queue
    ftp = FTPConnection(host)

    while True:
        file = q.get()

        if file.is_dir:
            print(file)
            # Directories fan out: every entry found goes back on the queue
            listing = ftp.process_path(os.path.join(file.path, file.name))
            for f in listing:
                q.put(f)

        q.task_done()


def do_the_thing():
    host = "80.252.155.68"

    # Fetch the root listing once to seed the queue
    ftp = FTPConnection(host)
    root_listing = ftp.process_path("/")
    ftp.ftp.close()

    q = Queue(maxsize=0)
    num_threads = 10

    for i in range(num_threads):
        worker = Thread(target=process_and_queue, args=(host, q))
        worker.daemon = True
        worker.start()

    for file in root_listing:
        q.put(file)

    # Blocks until every queued entry has been processed
    q.join()


if __name__ == '__main__':
    do_the_thing()
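
For reference, the ftputil surface the crawler relies on is small: FTPHost opens the session, and listdir, stat, and path.isdir provide the metadata. A standalone sketch of those calls, assuming some reachable anonymous server (the host below is a placeholder):

import ftputil

with ftputil.FTPHost("ftp.example.com", "anonymous", "od-database") as ftp:
    for name in ftp.listdir("/"):
        is_dir = ftp.path.isdir("/" + name)
        stat = ftp.stat("/" + name)  # os.stat-like result with st_size, st_mtime
        print("DIR " if is_dir else "FILE", name, stat.st_size, stat.st_mtime)

Because every worker in process_and_queue holds its own connection, directory listings proceed in parallel; q.join() in do_the_thing() only returns once each queued entry has been expanded (directories) or skipped (plain files).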


@@ -9,4 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
-twisted
+ftputil


@@ -1,24 +0,0 @@
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):

    def gotClient(self, client, request, file_path):
        protocol = FTPFileListProtocol()
        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)


@@ -1,9 +0,0 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()


@@ -1,49 +0,0 @@
import json
import scrapy
import os
from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for ftp directories. Will gather all files recursively"""

    name = "ftp_links"
    handle_httpstatus_list = [404]

    def __index__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        stripped_url = response.url[len(self.base_url) - 1:]
        files = json.loads(response.body)
        for file in files:
            if file['filetype'] == 'd':
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))
            if file['filetype'] == '-':
                print(file)
                result = File(
                    name=file['filename'],
                    path=stripped_url.strip("/"),
                    size=file['size'],
                    mtime=file['date'])
                yield result
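
For context, a spider like this would normally be launched with scrapy crawl ftp_links -a base_url=<url>, with scrapy forwarding -a arguments to the spider constructor as keyword arguments. Note that __index__ above appears to be a typo for __init__ (Python never calls __index__ during construction), so base_url was never actually assigned; the intended constructor was presumably:

import scrapy

class FtpLinksSpider(scrapy.Spider):
    name = "ftp_links"

    def __init__(self, **kw):
        # __init__, not __index__, receives the -a spider arguments
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")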