mirror of
https://github.com/simon987/od-database.git
synced 2025-12-16 08:09:04 +00:00
Renamed package (again) and removed unused files
0
scrapy_od_database/__init__.py
Normal file
94
scrapy_od_database/settings.py
Normal file
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_od_database'

SPIDER_MODULES = ['scrapy_od_database.spiders']
NEWSPIDER_MODULE = 'scrapy_od_database.spiders'

LOG_LEVEL = 'ERROR'
FEED_FORMAT = 'json'
FEED_URI = 'data.json'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 40
RETRY_TIMES = 5
DOWNLOAD_TIMEOUT = 50

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 50
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_downl_od.middlewares.ScrapyDownlOdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_downl_od.middlewares.ScrapyDownlOdDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapy_downl_od.pipelines.ScrapyDownlOdPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
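With FEED_FORMAT and FEED_URI set as above, every item the spider yields is appended to data.json as a single JSON array by Scrapy's feed exporter. A minimal sketch of consuming that feed after a crawl (not part of this commit; the example record in the comment is hypothetical, but the keys match the dict yielded in od_links_spider.save_file):

import json

# data.json is the feed written by Scrapy per FEED_URI above
with open("data.json") as f:
    files = json.load(f)  # one JSON array of crawled file records

for doc in files:
    # e.g. {"path": "pub/iso", "name": "image.iso", "size": 1048576,
    #       "mime": "application/octet-stream", "mtime": "..."}  (hypothetical values)
    print(doc["path"], doc["name"], doc["size"], doc["mime"], doc["mtime"])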
4
scrapy_od_database/spiders/__init__.py
Normal file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
76
scrapy_od_database/spiders/od_links_spider.py
Normal file
@@ -0,0 +1,76 @@
import scrapy
from os import path
from urllib.parse import unquote


class LinksSpider(scrapy.Spider):
    """Scrapy spider for open directories. Will gather all download links recursively"""

    name = "od_links"

    # Query strings added by directory-index "sort by" links; these are never followed
    black_list = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )

    def __init__(self, **kwargs):
        # base_url is passed on the command line with -a base_url=<target>
        super().__init__(**kwargs)
        self.base_url = kwargs.get("base_url")
        self.crawled_links = set()

    def should_ask_headers(self, link):
        """Whether or not to send a HEAD request (links not ending in "/" are treated as files)"""
        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")

    def should_crawl(self, link):
        """Whether or not the link should be followed"""
        if link in self.crawled_links:
            return False

        if link.endswith(self.black_list):
            return False

        if not link.startswith(self.base_url):
            return False

        return link.rsplit("?", maxsplit=1)[0].endswith("/")

    def start_requests(self):
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            links = response.xpath('//a/@href').extract()
            for link in links:
                full_link = response.urljoin(link)

                if self.should_ask_headers(full_link):
                    yield scrapy.Request(full_link, method="HEAD", callback=self.save_file)
                elif self.should_crawl(full_link):
                    self.crawled_links.add(full_link)
                    yield scrapy.Request(full_link, callback=self.parse)

    def save_file(self, response):

        if response.status == 200:
            # Save file information
            stripped_url = response.url[len(self.base_url) - 1:]
            self.crawled_links.add(response.url)

            yield {
                "path": unquote(path.split(stripped_url)[0]).strip("/"),
                "name": unquote(path.split(stripped_url)[1]),
                "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
                "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                if "Content-Type" in response.headers else "?",
                "mtime": response.headers["Date"].decode("utf-8") if "Date" in response.headers else "?"
            }
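Since the spider reads base_url from its keyword arguments, the usual entry point is scrapy crawl od_links -a base_url=<target> from the project root. A minimal sketch of launching it programmatically instead, assuming the project settings above are importable (not part of this commit; the target URL is a placeholder):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_od_database.spiders.od_links_spider import LinksSpider

# Load settings.py (feed export, concurrency, user agent, ...) and run the spider
process = CrawlerProcess(get_project_settings())
process.crawl(LinksSpider, base_url="http://example.com/files/")  # placeholder target
process.start()  # blocks until the crawl finishes; items land in data.json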