mirror of
https://github.com/simon987/od-database.git
synced 2025-04-20 10:56:47 +00:00
78 lines
2.6 KiB
Python
import scrapy
|
|
from os import path
|
|
from urllib.parse import unquote
|
|
from od_util import has_extension
|
|
|
|
|
|
class LinksSpider(scrapy.Spider):
    """Scrapy spider for open directories. Will gather all download links recursively."""

    name = "od_links"

    # Query strings appended by Apache mod_autoindex column-sorting links
    # ("&" and ";" separator variants). Following them would re-crawl the
    # same directory listing in a different sort order, so they are skipped.
    black_list = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A",
    )

    def __init__(self, **kwargs):
        """Initialize spider state.

        Expects ``base_url`` (the root of the open directory) in *kwargs*.

        Note: the original code defined a second constructor named
        ``__index__`` — a typo for ``__init__`` that was never invoked as a
        constructor, so ``base_url`` was only populated because
        ``scrapy.Spider.__init__`` copies kwargs into the instance dict.
        That logic is merged here explicitly.
        """
        super().__init__(**kwargs)
        # URLs already requested (both files and directory listings),
        # used to avoid re-visiting links within this spider's own logic.
        self.crawled_links = set()
        self.base_url = kwargs.get("base_url")

    def should_ask_headers(self, link):
        """Whether or not to send HEAD request (unseen link that looks like a file)."""
        return link not in self.crawled_links and has_extension(link)

    def should_crawl(self, link):
        """Whether or not the link should be followed as a directory listing."""
        if link in self.crawled_links:
            return False

        # black_list is already a tuple; str.endswith accepts it directly.
        if link.endswith(self.black_list):
            return False

        # Never leave the open directory being indexed.
        if not link.startswith(self.base_url):
            return False

        # Only follow directory listings: trailing "/" after dropping any query string.
        return link.rsplit("?", maxsplit=1)[0].endswith("/")

    def start_requests(self):
        """Entry point: crawl the root of the open directory."""
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        """Extract every anchor href; HEAD probable files, recurse into directories."""
        if response.status == 200:
            links = response.xpath('//a/@href').extract()
            for link in links:
                full_link = response.urljoin(link)

                if self.should_ask_headers(full_link):
                    yield scrapy.Request(full_link, method="HEAD", callback=self.save_file)
                elif self.should_crawl(full_link):
                    self.crawled_links.add(full_link)
                    yield scrapy.Request(full_link, callback=self.parse)

    def save_file(self, response):
        """Handle a HEAD response: emit one item describing the remote file."""
        if response.status == 200:
            # Save file information; keep the leading "/" by slicing one
            # character before the end of base_url.
            stripped_url = response.url[len(self.base_url) - 1:]
            self.crawled_links.add(response.url)

            # Split once into (directory part, file name) instead of twice.
            dir_part, file_name = path.split(stripped_url)
            yield {
                "path": unquote(dir_part).strip("/"),
                "name": unquote(file_name),
                # Missing headers are reported with sentinel values (-1 / "?")
                # rather than raising, since many servers omit them.
                "size": int(response.headers["Content-Length"].decode("utf-8"))
                if "Content-Length" in response.headers else -1,
                "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                if "Content-Type" in response.headers else "?",
                "mtime": response.headers["Date"].decode("utf-8")
                if "Date" in response.headers else "?"
            }
|
|
|