mirror of
https://github.com/simon987/od-database.git
synced 2025-12-19 17:44:52 +00:00
Removed unsuitable scrapy spider and implemented custom crawler
This commit is contained in:
@@ -1,24 +0,0 @@
|
||||
import json
|
||||
from twisted.protocols.ftp import FTPFileListProtocol
|
||||
from scrapy.http import Response
|
||||
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
|
||||
|
||||
|
||||
# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Scrapy FTP download handler that returns directory listings.

    Instead of retrieving a file, it issues a LIST for the requested path
    and serializes the parsed listing entries into the Response body as
    JSON, so the spider can treat a directory listing like a page.
    """

    def gotClient(self, client, request, file_path):
        """List *file_path* on the connected FTP client.

        Returns a Deferred that fires with a Response built from the
        parsed listing, or falls back to the inherited failure handler.
        """
        protocol = FTPFileListProtocol()
        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request, ))

    def _build_response(self, result, request, protocol):
        """Wrap the parsed file list in a JSON-bodied 200 Response.

        FIX: removed the original ``self.result = result`` assignment --
        the attribute was never read anywhere in this file, and stashing
        per-request state on the shared download handler is unsafe when
        multiple requests are in flight concurrently.
        """
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
|
||||
@@ -1,9 +0,0 @@
|
||||
from scrapy import Item, Field
|
||||
|
||||
|
||||
class File(Item):
    """Scrapy item describing a single file discovered on an FTP server."""

    # Directory path containing the file, relative to the crawl root
    # (the spider stores it with surrounding slashes stripped).
    path = Field()
    # Bare file name as reported by the server listing.
    name = Field()
    # MIME type; never populated by the FTP spider in this file --
    # presumably filled in by another pipeline stage. TODO confirm.
    mime = Field()
    # Last-modification date string taken from the listing's 'date' field.
    mtime = Field()
    # File size as reported by the listing's 'size' field.
    size = Field()
|
||||
@@ -1,49 +0,0 @@
|
||||
import json
|
||||
import scrapy
|
||||
import os
|
||||
from scrapy_od_database.items import File
|
||||
|
||||
|
||||
class AnonFtpRequest(scrapy.Request):
    """Request subclass pre-loaded with anonymous FTP credentials.

    Scrapy's FTP download handler reads ``ftp_user``/``ftp_password``
    from the request meta; this class injects them automatically so the
    spider never has to repeat the credentials.
    """

    # Credentials merged into every request's meta dict on construction.
    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        # Copy the anonymous credentials into this request's meta.
        for meta_key, meta_value in self.anon_meta.items():
            self.meta[meta_key] = meta_value
|
||||
|
||||
|
||||
class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for ftp directories. Will gather all files recursively"""

    name = "ftp_links"

    # 404 responses are handed to parse() instead of being filtered out.
    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        """Store the crawl root.

        FIX: the method was misspelled ``__index__``, so this constructor
        was never invoked and ``self.base_url`` was never set here.

        :param kw: spider arguments; ``base_url`` is the FTP root to crawl
            (assumed to end with a slash -- see parse()).
        """
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        """Kick off the crawl at the configured root URL."""
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        """Parse one JSON directory listing; recurse into subdirectories
        and emit a File item for every regular file."""
        # Path of this listing relative to the crawl root, keeping the
        # leading slash (assumes base_url ends with "/" -- TODO confirm).
        stripped_url = response.url[len(self.base_url) - 1:]

        files = json.loads(response.body)
        # NOTE: renamed the loop variable from `file`, which shadowed the
        # builtin, and removed a leftover debug print().
        for entry in files:

            if entry['filetype'] == 'd':
                # Directory: queue a listing request for it.
                yield AnonFtpRequest(os.path.join(response.url, entry["filename"]))

            if entry['filetype'] == '-':
                # Regular file: emit an item for the database.
                yield File(
                    name=entry['filename'],
                    path=stripped_url.strip("/"),
                    size=entry['size'],
                    mtime=entry['date'])
|
||||
Reference in New Issue
Block a user