Renamed package (again) and removed unused files

Simon
2018-05-31 08:30:08 -04:00
parent ca651278d0
commit 819e2fbddb
7 changed files with 10 additions and 21 deletions

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_od_database'

SPIDER_MODULES = ['scrapy_od_database.spiders']
NEWSPIDER_MODULE = 'scrapy_od_database.spiders'

LOG_LEVEL = 'ERROR'
FEED_FORMAT = 'json'
FEED_URI = 'data.json'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 40
RETRY_TIMES = 5
DOWNLOAD_TIMEOUT = 50

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 50
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_downl_od.middlewares.ScrapyDownlOdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_downl_od.middlewares.ScrapyDownlOdDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapy_downl_od.pipelines.ScrapyDownlOdPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
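
With these settings, the feed exporter writes every scraped item to data.json. A minimal usage sketch (illustrative only, not part of the diff) of launching the crawl programmatically, assuming the od_links spider defined later in this commit and a hypothetical target URL:

# Usage sketch (not part of the commit): run the od_links spider with the
# project settings above. The target URL below is a hypothetical example.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up FEED_FORMAT / FEED_URI
process.crawl("od_links", base_url="http://example.com/files/")
process.start()  # blocks until the crawl finishes; items end up in data.json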

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,76 @@
import scrapy
from os import path
from urllib.parse import unquote


class LinksSpider(scrapy.Spider):
    """Scrapy spider for open directories. Gathers all download links recursively."""

    name = "od_links"

    # Column-sorting query strings added by Apache-style directory listings;
    # following them would only re-crawl the same listing in a different order.
    black_list = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.base_url = kwargs.get("base_url")
        self.crawled_links = set()

    def should_ask_headers(self, link):
        """Whether or not to send a HEAD request"""
        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")

    def should_crawl(self, link):
        """Whether or not the link should be followed"""
        if link in self.crawled_links:
            return False
        if link.endswith(tuple(self.black_list)):
            return False
        if not link.startswith(self.base_url):
            return False
        return link.rsplit("?", maxsplit=1)[0].endswith("/")

    def start_requests(self):
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            links = response.xpath('//a/@href').extract()
            for link in links:
                full_link = response.urljoin(link)
                if self.should_ask_headers(full_link):
                    # Looks like a file: request only its headers
                    yield scrapy.Request(full_link, method="HEAD", callback=self.save_file)
                elif self.should_crawl(full_link):
                    # Looks like a sub-directory: follow it
                    self.crawled_links.add(full_link)
                    yield scrapy.Request(full_link, callback=self.parse)

    def save_file(self, response):
        if response.status == 200:
            # Save file information
            stripped_url = response.url[len(self.base_url) - 1:]
            self.crawled_links.add(response.url)
            yield {
                "path": unquote(path.split(stripped_url)[0]).strip("/"),
                "name": unquote(path.split(stripped_url)[1]),
                "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
                "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                if "Content-Type" in response.headers else "?",
                "mtime": response.headers["Date"].decode("utf-8") if "Date" in response.headers else "?"
            }
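
For illustration, the two predicates above split every link found on a listing page into files, which get a HEAD request, and sub-directories, which are crawled recursively. A minimal sketch of that behaviour (illustrative only, assuming it runs in the same module as LinksSpider; all URLs are hypothetical):

# Illustration only: how the spider classifies links relative to its base_url.
spider = LinksSpider(base_url="http://example.com/files/")

assert spider.should_ask_headers("http://example.com/files/song.mp3")    # file -> HEAD request
assert not spider.should_ask_headers("http://example.com/files/music/")  # trailing slash -> not a file
assert spider.should_crawl("http://example.com/files/music/")            # directory under base_url -> follow
assert not spider.should_crawl("http://example.com/files/?C=N;O=D")      # blacklisted sort link -> skip
assert not spider.should_crawl("http://elsewhere.example.org/files/")    # outside base_url -> skip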