mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00
78 lines
2.6 KiB
Python
import scrapy
|
|
from os import path
|
|
from urllib.parse import unquote
|
|
from od_util import has_extension
|
|
|
|
|
|
class LinksSpider(scrapy.Spider):
    """Scrapy spider for open directories. Will gather all download links recursively."""

    name = "od_links"

    # Query strings appended by Apache mod_autoindex column-sorting links
    # ("&" and ";" separator variants). Following them would re-crawl the
    # same directory listing in a different sort order, so they are skipped.
    black_list = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A",
    )

    def __init__(self, **kwargs):
        """Initialize spider state.

        Expects ``base_url`` (the root of the open directory) in *kwargs*.

        Note: the original code defined a second constructor named
        ``__index__`` — a typo for ``__init__`` that was never invoked as a
        constructor, so ``base_url`` was only populated because
        ``scrapy.Spider.__init__`` copies kwargs into the instance dict.
        That logic is merged here explicitly.
        """
        super().__init__(**kwargs)
        # URLs already requested (both files and directory listings),
        # used to avoid re-visiting links within this spider's own logic.
        self.crawled_links = set()
        self.base_url = kwargs.get("base_url")

    def should_ask_headers(self, link):
        """Whether or not to send HEAD request (unseen link that looks like a file)."""
        return link not in self.crawled_links and has_extension(link)

    def should_crawl(self, link):
        """Whether or not the link should be followed as a directory listing."""
        if link in self.crawled_links:
            return False

        # black_list is already a tuple; str.endswith accepts it directly.
        if link.endswith(self.black_list):
            return False

        # Never leave the open directory being indexed.
        if not link.startswith(self.base_url):
            return False

        # Only follow directory listings: trailing "/" after dropping any query string.
        return link.rsplit("?", maxsplit=1)[0].endswith("/")

    def start_requests(self):
        """Entry point: crawl the root of the open directory."""
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        """Extract every anchor href; HEAD probable files, recurse into directories."""
        if response.status == 200:
            links = response.xpath('//a/@href').extract()
            for link in links:
                full_link = response.urljoin(link)

                if self.should_ask_headers(full_link):
                    yield scrapy.Request(full_link, method="HEAD", callback=self.save_file)
                elif self.should_crawl(full_link):
                    self.crawled_links.add(full_link)
                    yield scrapy.Request(full_link, callback=self.parse)

    def save_file(self, response):
        """Handle a HEAD response: emit one item describing the remote file."""
        if response.status == 200:
            # Save file information; keep the leading "/" by slicing one
            # character before the end of base_url.
            stripped_url = response.url[len(self.base_url) - 1:]
            self.crawled_links.add(response.url)

            # Split once into (directory part, file name) instead of twice.
            dir_part, file_name = path.split(stripped_url)
            yield {
                "path": unquote(dir_part).strip("/"),
                "name": unquote(file_name),
                # Missing headers are reported with sentinel values (-1 / "?")
                # rather than raising, since many servers omit them.
                "size": int(response.headers["Content-Length"].decode("utf-8"))
                if "Content-Length" in response.headers else -1,
                "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                if "Content-Type" in response.headers else "?",
                "mtime": response.headers["Date"].decode("utf-8")
                if "Date" in response.headers else "?"
            }
|
|
|