Slowly losing my sanity part 1: Removed scrapy dependency and moved to custom solution. Added multi-threaded ftp crawler

Simon 2018-06-11 15:46:55 -04:00
parent b649b82854
commit 7f496ce7a8
10 changed files with 338 additions and 284 deletions

crawler/crawler.py Normal file (+133)

@@ -0,0 +1,133 @@
import os
import json
from urllib.parse import urlparse
from timeout_decorator.timeout_decorator import TimeoutError
from threading import Thread
from queue import Queue, Empty


class TooManyConnectionsError(Exception):
    pass


class File:

    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
        self.name = name
        self.size = size
        self.mtime = mtime
        self.path = path
        self.is_dir = is_dir

    def __str__(self):
        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name

    def to_json(self):
        return json.dumps({
            "name": self.name,
            "size": self.size,
            "mtime": self.mtime,
            "path": self.path,
        })


class RemoteDirectory:

    SCHEMES = ()

    def __init__(self, base_url):
        self.base_url = base_url

    def list_dir(self, path: str) -> list:
        raise NotImplementedError

    def close(self):
        pass


class RemoteDirectoryFactory:

    from crawler.ftp import FtpDirectory
    from crawler.http import HttpDirectory
    DIR_ENGINES = (FtpDirectory, HttpDirectory)

    @staticmethod
    def get_directory(url) -> RemoteDirectory:
        parsed_url = urlparse(url)

        for dir_engine in RemoteDirectoryFactory.DIR_ENGINES:
            if parsed_url.scheme in dir_engine.SCHEMES:
                return dir_engine(url)


class RemoteDirectoryCrawler:

    def __init__(self, url, max_threads: int):
        self.url = url
        self.max_threads = max_threads

    def crawl_directory(self):

        try:
            directory = RemoteDirectoryFactory.get_directory(self.url)
            root_listing = directory.list_dir("/dl2/")  # todo get path
            directory.close()
        except TimeoutError:
            return

        in_q = Queue(maxsize=0)
        files_q = Queue(maxsize=0)
        for f in root_listing:
            if f.is_dir:
                in_q.put(f)
            else:
                files_q.put(f)

        threads = []
        for i in range(self.max_threads):
            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
            threads.append(worker)
            worker.start()

        in_q.join()
        print("DONE")

        # Kill threads
        for _ in threads:
            in_q.put(None)
        for t in threads:
            t.join()

        print(files_q.qsize())
        return []

    @staticmethod
    def _process_listings(url: str, in_q: Queue, files_q: Queue):

        directory = RemoteDirectoryFactory.get_directory(url)

        while directory:

            try:
                file = in_q.get(timeout=60)
            except Empty:
                break

            if file is None:
                break

            try:
                listing = directory.list_dir(os.path.join(file.path, file.name, ""))

                for f in listing:
                    if f.is_dir:
                        in_q.put(f)
                    else:
                        files_q.put(f)
            except TooManyConnectionsError as e:
                print("TOO MANY CONNNS")
            except TimeoutError:
                pass
            finally:
                in_q.task_done()
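
A minimal driver sketch for the classes above; the FTP URL and thread count are placeholders, and since crawl_directory() still hard-codes "/dl2/" as the root path (see the todo) and returns an empty list, this only exercises the traversal:

# Hypothetical driver for RemoteDirectoryCrawler; the URL and max_threads are placeholders.
from crawler.crawler import RemoteDirectoryCrawler

if __name__ == "__main__":
    # Workers share one queue of directories to list and one queue of discovered files;
    # crawl_directory() blocks until the directory queue is fully drained.
    crawler = RemoteDirectoryCrawler("ftp://ftp.example.com", max_threads=10)
    crawler.crawl_directory()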

crawler/ftp.py Normal file (+79)

@@ -0,0 +1,79 @@
#! /usr/bin/env python
from urllib.parse import urlparse
import os
import time
import ftputil
import ftputil.error
from ftputil.session import session_factory
import random
import timeout_decorator

from crawler.crawler import RemoteDirectory, File, TooManyConnectionsError


class FtpDirectory(RemoteDirectory):

    SCHEMES = ("ftp", )

    def __init__(self, url):
        host = urlparse(url).netloc
        super().__init__(host)
        self.failed_attempts = 0
        self.max_attempts = 2
        self.ftp = None
        self.stop_when_connected()

    def _connect(self):
        self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
            use_passive_mode=False
        ))

    def stop_when_connected(self):
        while self.failed_attempts < self.max_attempts:
            try:
                self._connect()
                self.failed_attempts = 0
                break
            except ftputil.error.FTPError as e:

                if e.errno == 530:
                    print("Cancel connection - too many connections")
                    break

                self.failed_attempts += 1
                print("Connection error; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))
                self.stop_when_connected()

    @timeout_decorator.timeout(15, use_signals=False)
    def list_dir(self, path) -> list:
        if not self.ftp:
            print("Conn closed")
            return []
        results = []
        try:
            self.ftp.chdir(path)
            file_names = self.ftp.listdir(path)

            for file_name in file_names:
                stat = self.ftp.stat(file_name)
                is_dir = self.ftp.path.isdir(os.path.join(path, file_name))

                results.append(File(
                    name=file_name,
                    mtime=stat.st_mtime,
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path
                ))
        except ftputil.error.FTPError as e:
            if e.errno == 530:
                raise TooManyConnectionsError()
            pass

        return results

    def close(self):
        if self.ftp:
            self.ftp.close()
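
FtpDirectory can also be exercised on its own; a small sketch, assuming a placeholder host. Because list_dir() is wrapped in a 15-second timeout_decorator timeout (use_signals=False) and converts FTP error 530 into TooManyConnectionsError, a caller should handle both:

# Hypothetical standalone use of FtpDirectory; the host below is a placeholder.
from timeout_decorator.timeout_decorator import TimeoutError

from crawler.crawler import TooManyConnectionsError
from crawler.ftp import FtpDirectory

ftp_dir = FtpDirectory("ftp://ftp.example.com")
try:
    for f in ftp_dir.list_dir("/"):
        print(f)  # File.__str__ renders "DIR <path>/<name>" or "FILE <path>/<name>"
except TimeoutError:
    print("Listing took longer than 15 seconds")
except TooManyConnectionsError:
    print("Server rejected the connection (530)")
finally:
    ftp_dir.close()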

crawler/http.py Normal file (+123)

@@ -0,0 +1,123 @@
from urllib.parse import urlparse, urljoin, unquote
import os
from lxml import etree
from itertools import repeat
from crawler.crawler import RemoteDirectory, File
import requests
from requests.exceptions import RequestException
from multiprocessing.pool import ThreadPool


class HttpDirectory(RemoteDirectory):

    SCHEMES = ("http", "https",)
    HEADERS = {}
    BLACK_LIST = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )

    def __init__(self, url):
        super().__init__(url)
        self.parser = etree.HTMLParser(collect_ids=False)

    def list_dir(self, path) -> list:
        results = []

        path_url = urljoin(self.base_url, path)
        body = self._fetch_body(path_url)
        links = self._parse_links(body)

        urls_to_request = []

        for link in links:
            if self._should_ignore(link):
                continue

            file_url = urljoin(path_url, link[1])
            path, file_name = os.path.split(file_url[len(self.base_url) - 1:])

            if self._isdir(link):
                results.append(File(
                    name=file_name,
                    mtime="",
                    size=-1,
                    is_dir=True,
                    path=path
                ))
            else:
                urls_to_request.append(file_url)

        pool = ThreadPool(processes=10)
        files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
        for f in files:
            if f:
                results.append(f)

        return results

    def _get_url(self, path: str):
        return urljoin(self.base_url, path)

    @staticmethod
    def _fetch_body(url: str):
        # todo timeout
        print("FETCH " + url)
        r = requests.get(url, headers=HttpDirectory.HEADERS)
        return r.text

    def _parse_links(self, body: str) -> set:
        result = set()

        tree = etree.HTML(body, parser=self.parser)
        links = tree.findall(".//a/[@href]")

        for link in links:
            result.add((link.text, link.get("href")))

        return result

    @staticmethod
    def _isdir(url):
        return url[1].rsplit("?", maxsplit=1)[0].endswith("/")

    def _request_file(self, url):
        # todo timeout
        retries = 3
        while retries > 0:
            try:
                print("HEAD " + url)
                r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)

                stripped_url = r.url[len(self.base_url) - 1:]
                path, name = os.path.split(stripped_url)

                return File(
                    path=unquote(path).strip("/"),
                    name=unquote(name),
                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                    mtime=r.headers["Date"] if "Date" in r.headers else "?",
                    is_dir=False
                )
            except RequestException:
                retries -= 1

        return None

    @staticmethod
    def _should_ignore(link):
        return link[0] == "../" or link[1].endswith(HttpDirectory.BLACK_LIST)
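
A matching sketch for HttpDirectory, assuming a placeholder open-directory URL. list_dir() fetches the index page, parses the anchors with lxml, records links whose href ends in "/" as directories, and resolves the remaining links with HEAD requests on a 10-worker ThreadPool:

# Hypothetical standalone use of HttpDirectory; the base URL is a placeholder.
from crawler.http import HttpDirectory

http_dir = HttpDirectory("http://example.com/files/")
for f in http_dir.list_dir("/files/"):
    # Directories are reported with size=-1; files carry the Content-Length and Date
    # values taken from the HEAD response headers.
    print(f.to_json())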

@@ -1,127 +0,0 @@
#! /usr/bin/env python
from threading import Thread
from queue import Queue
import os
import time
import ftputil
import ftputil.error
import random


class File:

    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
        self.name = name
        self.size = size
        self.mtime = mtime
        self.path = path
        self.is_dir = is_dir
        self.ftp = None

    def __str__(self):
        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name


class FTPConnection(object):

    def __init__(self, host):
        self.host = host
        self.failed_attempts = 0
        self.max_attempts = 2
        self.ftp = None
        self.stop_when_connected()

    def _connect(self):
        print("Connecting to " + self.host)
        self.ftp = ftputil.FTPHost(self.host, "anonymous", "od-database")

    def stop_when_connected(self):
        while self.failed_attempts < self.max_attempts:
            try:
                self._connect()
                self.failed_attempts = 0
                break
            except ftputil.error.FTPError as e:

                if e.errno == 530:
                    print("Cancel connection - too many connections")
                    break

                self.failed_attempts += 1
                print("LIST FAILED; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))
                self.stop_when_connected()

    def list_dir(self, path) -> list:
        if not self.ftp:
            return []
        results = []
        self.ftp.chdir(path)
        try:
            file_names = self.ftp.listdir(path)

            for file_name in file_names:
                stat = self.ftp.stat(file_name)
                is_dir = self.ftp.path.isdir(os.path.join(path, file_name))

                results.append(File(
                    name=file_name,
                    mtime=stat.st_mtime,
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path
                ))
        except ftputil.error.FTPError:
            print("ERROR parsing " + path)

        return results


def process_and_queue(host, q: Queue):
    ftp = FTPConnection(host)

    while ftp.ftp:
        file = q.get()

        if file.is_dir:
            print(file)
            try:
                listing = ftp.list_dir(os.path.join(file.path, file.name))
                for f in listing:
                    q.put(f)
            except ftputil.error.PermanentError as e:
                if e.errno == 530:
                    # Too many connections, retry this dir but kill this thread
                    q.put(file)
                    ftp.ftp.close()
                    print("Dropping connection because too many")
                else:
                    pass

        q.task_done()


def crawl_ftp_server(host: str, max_threads: int) -> list:

    ftp = FTPConnection(host)
    root_listing = ftp.list_dir("/")
    if ftp.ftp:
        ftp.ftp.close()

    q = Queue(maxsize=0)
    for i in range(max_threads):
        worker = Thread(target=process_and_queue, args=(host, q,))
        worker.setDaemon(True)
        worker.start()

    for file in root_listing:
        q.put(file)

    q.join()

    return []


if __name__ == '__main__':
    import sys
    crawl_ftp_server(sys.argv[1], 50)

@@ -75,7 +75,7 @@ def is_od(url):
        ftp.close()
        return True
    else:
        r = requests.get(url, timeout=15, allow_redirects=False)
        r = requests.get(url, timeout=30, allow_redirects=False)
        if r.status_code != 200:
            print("No redirects allowed!")
            return False

@@ -1,4 +1,3 @@
scrapy
flask
requests
bs4
@@ -9,4 +8,5 @@ praw
humanfriendly
apscheduler
bcrypt
ftputil
ftputil
lxml

@@ -1,71 +0,0 @@
# -*- coding: utf-8 -*-
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'scrapy_od_database'
SPIDER_MODULES = ['scrapy_od_database.spiders']
NEWSPIDER_MODULE = 'scrapy_od_database.spiders'
DOWNLOAD_HANDLERS = {'ftp': 'scrapy_od_database.handlers.FtpListingHandler'}
LOG_LEVEL = 'ERROR'
FEED_FORMAT = 'json'
FEED_URI = 'data.json'
USER_AGENT = 'Mozilla/5.0 (X11; od-database.simon987.net) AppleWebKit/537.36 ' \
             '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 40
RETRY_TIMES = 6
DOWNLOAD_TIMEOUT = 90
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 40
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -1,79 +0,0 @@
import scrapy
import os
from urllib.parse import unquote


class LinksSpider(scrapy.Spider):
    """Scrapy spider for open directories. Will gather all download links recursively"""

    name = "od_links"

    black_list = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.crawled_links = set()

    def __index__(self, **kw):
        super(LinksSpider, self).__init__(**kw)
        self.base_url = kw.get("base_url")

    def should_ask_headers(self, link):
        """Whether or not to send HEAD request"""
        return link not in self.crawled_links and not link.rsplit("?", maxsplit=1)[0].endswith("/")

    def should_crawl(self, link):
        """Whether or not the link should be followed"""
        if link in self.crawled_links:
            return False
        if link.endswith(tuple(self.black_list)):
            return False
        if not link.startswith(self.base_url):
            return False
        return link.rsplit("?", maxsplit=1)[0].endswith("/")

    def start_requests(self):
        yield scrapy.Request(url=self.base_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            links = response.xpath('//a/@href').extract()

            for link in links:
                full_link = response.urljoin(link)

                if self.should_ask_headers(full_link):
                    yield scrapy.Request(full_link, method="HEAD", callback=self.save_file)
                elif self.should_crawl(full_link):
                    self.crawled_links.add(full_link)
                    yield scrapy.Request(full_link, callback=self.parse)

    def save_file(self, response):

        if response.status == 200:
            # Save file information
            stripped_url = response.url[len(self.base_url) - 1:]
            self.crawled_links.add(response.url)

            path, name = os.path.split(stripped_url)
            yield {
                "path": unquote(path).strip("/"),
                "name": unquote(name),
                "size": int(response.headers["Content-Length"].decode("utf-8")) if "Content-Length" in response.headers else -1,
                "mime": response.headers["Content-Type"].decode("utf-8").split(";", maxsplit=1)[0]
                    if "Content-Type" in response.headers else "?",
                "mtime": response.headers["Date"].decode("utf-8") if "Date" in response.headers else "?"
            }