Mirror of https://github.com/simon987/od-database.git, synced 2025-04-19 10:26:44 +00:00
Removed unsuitable scrapy spider and implemented custom crawler
commit f2d914060b
parent d8c16d53e6

app.py (3 changed lines)
@@ -157,8 +157,7 @@ def try_enqueue(url):
         return "A parent directory of this url has already been posted", "danger"

     if not od_util.is_valid_url(url):
-        return "<strong>Error:</strong> Invalid url. Make sure to include the http(s):// suffix. " \
-               "FTP is not supported", "danger"
+        return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "danger"

     if od_util.is_blacklisted(url):

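The hunk above only rewrites the error message; the guard itself is od_util.is_valid_url. For context, a minimal sketch of the kind of scheme/host check such a helper could perform (hypothetical; the real od_util implementation is not shown in this diff):

# Hypothetical sketch only; not the od_util.is_valid_url used by app.py.
from urllib.parse import urlparse

def is_valid_url(url: str) -> bool:
    # Require an explicit scheme and a host; ftp is allowed now that a
    # dedicated FTP crawler exists (see ftp_crawler.py below).
    parsed = urlparse(url)
    return parsed.scheme in ("http", "https", "ftp") and bool(parsed.netloc)

Allowing the ftp scheme here matches the intent of this commit, which replaces the "FTP is not supported" wording and adds the crawler below.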
ftp_crawler.py (new file, 127 lines)

@@ -0,0 +1,127 @@
#! /usr/bin/env python

from threading import Thread
from queue import Queue
import os
import time
import ftputil
import random


class File:

    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
        self.name = name
        self.size = size
        self.mtime = mtime
        self.path = path
        self.is_dir = is_dir

    def __str__(self):
        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name


class FTPConnection(object):
    def __init__(self, host):
        self.host = host
        self.failed_attempts = 0
        self.max_attempts = 5
        self.stop_when_connected()
        self._list_fn = None

    def _connect(self):
        # attempt an anonymous FTP connection
        print("CONNECT %s ATTEMPT", self.host)
        self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database")
        print("CONNECT %s SUCCESS", self.host)

    def stop_when_connected(self):
        # continually tries to reconnect ad infinitum
        # TODO: Max retries
        try:
            self._connect()
        except Exception:
            print("CONNECT %s FAILED; trying again...", self.host)
            time.sleep(5 * random.uniform(0.5, 1.5))
            self.stop_when_connected()

    def list(self, path) -> list:
        results = []
        self.ftp.chdir(path)
        file_names = self.ftp.listdir(path)

        for file_name in file_names:
            stat = self.ftp.stat(file_name)
            is_dir = self.ftp.path.isdir(os.path.join(path, file_name))

            results.append(File(
                name=file_name,
                mtime=stat.st_mtime,
                size=-1 if is_dir else stat.st_size,
                is_dir=is_dir,
                path=path
            ))

        return results

    def process_path(self, path):
        while self.failed_attempts < self.max_attempts:
            try:
                results = self.list(path)
                self.failed_attempts = 0
                return results
            except Exception as e:
                print(e)
                self.failed_attempts += 1
                self.ftp.close()
                print("LIST FAILED; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))
                self.stop_when_connected()

        # if I get here, I never succeeded in getting the data
        print("LIST ABANDONED %s", path)
        self.failed_attempts = 0
        return []


def process_and_queue(host, q: Queue):

    ftp = FTPConnection(host)

    while True:
        file = q.get()

        if file.is_dir:
            print(file)
            listing = ftp.process_path(os.path.join(file.path, file.name))
            for f in listing:
                q.put(f)
        else:
            pass

        q.task_done()


def do_the_thing():

    host = "80.252.155.68"
    ftp = FTPConnection(host)
    root_listing = ftp.process_path("/")
    ftp.ftp.close()

    q = Queue(maxsize=0)
    num_threads = 10

    for i in range(num_threads):
        worker = Thread(target=process_and_queue, args=(host, q,))
        worker.setDaemon(True)
        worker.start()

    for file in root_listing:
        q.put(file)

    q.join()


if __name__ == '__main__':
    do_the_thing()
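ftp_crawler.py uses a producer-consumer design: directory entries go onto a shared Queue and ten daemon threads expand them recursively, while plain files are currently discarded in the else: pass branch. A hedged sketch of how a worker could keep those files instead — collecting_worker and results are illustrative names, not part of this commit, and FTPConnection is the class defined in the file above:

# Hypothetical variant of process_and_queue that keeps plain files instead of
# discarding them; "collecting_worker" and "results" are illustrative only.
import os
from queue import Queue

def collecting_worker(host: str, q: Queue, results: list):
    ftp = FTPConnection(host)          # class from ftp_crawler.py above
    while True:
        file = q.get()
        if file.is_dir:
            # push sub-listings back onto the queue so other workers expand them
            for f in ftp.process_path(os.path.join(file.path, file.name)):
                q.put(f)
        else:
            results.append(file)       # list.append is atomic under CPython's GIL
        q.task_done()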
@@ -9,4 +9,4 @@ praw
 humanfriendly
 apscheduler
 bcrypt
-twisted
+ftputil
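The dependency change mirrors the code change: twisted (needed by the scrapy FTP handler removed below) gives way to ftputil, whose FTPHost API the new FTPConnection class wraps. A minimal standalone sketch of the ftputil calls relied on above, using a placeholder host:

# Minimal ftputil sketch; "ftp.example.com" is a placeholder, not from this commit.
import ftputil

with ftputil.FTPHost("ftp.example.com", "anonymous", "od-database") as host:
    for name in host.listdir("/"):
        info = host.stat(name)
        kind = "DIR " if host.path.isdir(name) else "FILE"
        print(kind, name, info.st_size, info.st_mtime)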
@@ -1,24 +0,0 @@
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler


# Inspired by https://github.com/laserson/ftptree
class FtpListingHandler(FTPDownloadHandler):

    def gotClient(self, client, request, file_path):

        protocol = FTPFileListProtocol()

        return client.list(file_path, protocol).addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, protocol),
            errback=self._failed,
            errbackArgs=(request, ))

    def _build_response(self, result, request, protocol):

        self.result = result
        body = json.dumps(protocol.files).encode()
        return Response(url=request.url, status=200, body=body)
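For reference, a custom handler like the one removed above is normally wired into scrapy through the DOWNLOAD_HANDLERS setting, mapping the ftp scheme to the handler class. A sketch with an illustrative dotted path (the project's actual settings file is not part of this diff):

# settings.py sketch; the dotted path below is illustrative, not from this commit.
DOWNLOAD_HANDLERS = {
    "ftp": "scrapy_od_database.handlers.FtpListingHandler",
}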
@@ -1,9 +0,0 @@
from scrapy import Item, Field


class File(Item):
    path = Field()
    name = Field()
    mime = Field()
    mtime = Field()
    size = Field()
@@ -1,49 +0,0 @@
import json
import scrapy
import os
from scrapy_od_database.items import File


class AnonFtpRequest(scrapy.Request):

    anon_meta = {
        "ftp_user": "anonymous",
        "ftp_password": "od-database"
    }

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpLinksSpider(scrapy.Spider):
    """Scrapy spider for ftp directories. Will gather all files recursively"""

    name = "ftp_links"

    handle_httpstatus_list = [404]

    def __index__(self, **kw):
        super(FtpLinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def start_requests(self):
        yield AnonFtpRequest(url=self.base_url, callback=self.parse)

    def parse(self, response):
        stripped_url = response.url[len(self.base_url) - 1:]

        files = json.loads(response.body)
        for file in files:

            if file['filetype'] == 'd':
                yield AnonFtpRequest(os.path.join(response.url, file["filename"]))

            if file['filetype'] == '-':
                print(file)
                result = File(
                    name=file['filename'],
                    path=stripped_url.strip("/"),
                    size=file['size'],
                    mtime=file['date'])
                yield result
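For completeness, the removed spider would have been driven through scrapy's crawler machinery with base_url passed as a spider argument; a hedged launcher sketch (the spider's module path is assumed, not taken from this diff):

# Illustrative launcher for the removed spider; the import path is assumed.
from scrapy.crawler import CrawlerProcess
from scrapy_od_database.spiders.ftp_links import FtpLinksSpider

process = CrawlerProcess()
process.crawl(FtpLinksSpider, base_url="ftp://ftp.example.com/")
process.start()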