Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
Added basic FTP spider for Scrapy
This commit is contained in:
parent f1e8183cdf
commit 0304c98a31
@@ -9,3 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
+twisted
scrapy_od_database/handlers.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that returns an FTP directory listing as a JSON response body."""

    def gotClient(self, client, request, file_path):
        # Ask the connected FTP client for a structured listing of file_path
        protocol = FTPFileListProtocol()

        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request, ))

    def _build_response(self, result, request, protocol):
        self.result = result
        # protocol.files holds one dict per entry in the listing
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
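
Note (illustrative, not part of the commit): FTPFileListProtocol parses each line of the FTP LIST reply into a dict, so the body built above decodes to a JSON array of such entries. A minimal sketch, assuming the key names used by Twisted's listing parser; the values are invented:

# Sketch only: roughly what json.loads(response.body) yields for the spider below.
# Keys are assumed to match Twisted's FTPFileListProtocol output; values are made up.
import json

sample_body = json.dumps([{
    "filetype": "-",           # '-' regular file, 'd' directory
    "perms": "rw-r--r--",
    "nlinks": 1,
    "owner": "ftp",
    "group": "ftp",
    "size": 1048576,
    "date": "Jan 01 2018",
    "filename": "example.iso",
    "linktarget": None,
}]).encode()

files = json.loads(sample_body)
assert files[0]["filetype"] == "-" and files[0]["filename"] == "example.iso"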
scrapy_od_database/items.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()
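
A quick sketch of how this item behaves (not in the commit; all values are placeholders). Scrapy items act like dicts restricted to their declared fields:

# Illustration only: constructing and reading the File item declared above.
from scrapy_od_database.items import File

f = File(name="example.iso", path="pub/isos", size=1048576, mtime="Jan 01 2018")
f["mime"] = "application/octet-stream"  # fields can also be filled in later
print(dict(f))                          # plain dict view of the item
# f["owner"] = "ftp"                    # would raise KeyError: 'owner' is not a declared Field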
@@ -11,6 +11,7 @@ BOT_NAME = 'scrapy_od_database'

 SPIDER_MODULES = ['scrapy_od_database.spiders']
 NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
+DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}

 LOG_LEVEL = 'ERROR'
 FEED_FORMAT = 'json'
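
The commit registers the handler project-wide; for reference, the same wiring could also sit on a single spider through Scrapy's custom_settings attribute. A sketch only, not used in this commit; the spider name is hypothetical:

# Sketch: per-spider registration of the FTP listing handler.
import scrapy


class ExampleFtpSpider(scrapy.Spider):
    name = "example_ftp"  # hypothetical spider, shown only to illustrate custom_settings
    custom_settings = {
        "DOWNLOAD_HANDLERS": {"ftp": "scrapy_od_database.handlers.FtpListingHandler"},
    }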
scrapy_od_database/spiders/ftp_links_spider.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import json
import scrapy
import os
from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):
    """Request that always carries anonymous FTP credentials in its meta."""

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for ftp directories. Will gather all files recursively"""

    name = "ftp_links"

    handle_httpstatus_list = [404]

    def __init__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        # Path of the current directory relative to the FTP root
        stripped_url = response.url[len(self.base_url) - 1:]

        # FtpListingHandler returns the directory listing as a JSON body
        files = json.loads(response.body)
        for file in files:

            if file['filetype'] == 'd':
                # Directory: recurse into it
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))

            if file['filetype'] == '-':
                # Regular file: emit a File item
                print(file)
                result = File(
                    name=file['filename'],
                    path=stripped_url.strip("/"),
                    size=file['size'],
                    mtime=file['date'])
                yield result
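
One way to run the new spider programmatically (a sketch under the assumption that the project settings above are on the path; "ftp://example.com/" is a placeholder URL):

# Sketch: run FtpLinksSpider outside of `scrapy crawl`.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_od_database.spiders.ftp_links_spider import FtpLinksSpider

process = CrawlerProcess(get_project_settings())
process.crawl(FtpLinksSpider, base_url="ftp://example.com/")  # kwargs reach __init__
process.start()  # blocks until the crawl finishes; items go to the configured JSON feed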
@@ -1,5 +1,5 @@
 import scrapy
-from os import path
+import os
 from urllib.parse import unquote


@@ -65,9 +65,12 @@ class LinksSpider(scrapy.Spider):
         # Save file information
         stripped_url = response.url[len(self.base_url) - 1:]
         self.crawled_links.add(response.url)
+
+        path, name = os.path.split(stripped_url)
+
         yield {
-            "path": unquote(path.split(stripped_url)[0]).strip("/"),
-            "name": unquote(path.split(stripped_url)[1]),
+            "path": unquote(path).strip("/"),
+            "name": unquote(name),
             "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
             "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
             if "Content-Type" in response.headers else "?",
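
The switch to os.path.split removes the double split done by the old code in LinksSpider.parse. A quick illustration with a made-up value of stripped_url:

# Illustration only: how the new path/name split behaves on a sample URL suffix.
import os
from urllib.parse import unquote

stripped_url = "pub/linux/ubuntu-18.04.iso"  # hypothetical stripped_url
path, name = os.path.split(stripped_url)
print(unquote(path).strip("/"))  # -> "pub/linux"
print(unquote(name))             # -> "ubuntu-18.04.iso"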
task.py (3 added lines)
@@ -56,6 +56,7 @@ class TaskManager:
         os.remove("data.json")
         print("Imported in SQLite3")

+        # TODO: Extract 'callbacks' for posts and comments in a function
         if post_id:
             # Reply to post
             stats = self.db.get_website_stats(website.id)
@@ -75,6 +76,8 @@ class TaskManager:
             print(comment)
             reddit_comment = self.reddit_bot.reddit.comment(comment_id)
             self.reddit_bot.reply(reddit_comment, comment)

         busy.value = 0
         print("Done crawling task")