Slowly losing my sanity part 1: Removed scrapy dependency and moved to custom solution. Added multi-threaded ftp crawler

Author: Simon
Date:   2018-06-11 15:46:55 -04:00
Parent: b649b82854
Commit: 7f496ce7a8

10 changed files with 338 additions and 284 deletions

crawler/__init__.py (new file, 0 additions)

crawler/crawler.py (new file, 133 additions)

@@ -0,0 +1,133 @@
import os
import json
from urllib.parse import urlparse
from timeout_decorator.timeout_decorator import TimeoutError
from threading import Thread
from queue import Queue, Empty


class TooManyConnectionsError(Exception):
    pass


class File:

    def __init__(self, name: str, size: int, mtime: str, path: str, is_dir: bool):
        self.name = name
        self.size = size
        self.mtime = mtime
        self.path = path
        self.is_dir = is_dir

    def __str__(self):
        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name

    def to_json(self):
        return json.dumps({
            "name": self.name,
            "size": self.size,
            "mtime": self.mtime,
            "path": self.path,
        })


class RemoteDirectory:

    SCHEMES = ()

    def __init__(self, base_url):
        self.base_url = base_url

    def list_dir(self, path: str) -> list:
        raise NotImplementedError

    def close(self):
        pass


class RemoteDirectoryFactory:

    from crawler.ftp import FtpDirectory
    from crawler.http import HttpDirectory
    DIR_ENGINES = (FtpDirectory, HttpDirectory)

    @staticmethod
    def get_directory(url) -> RemoteDirectory:
        parsed_url = urlparse(url)
        for dir_engine in RemoteDirectoryFactory.DIR_ENGINES:
            if parsed_url.scheme in dir_engine.SCHEMES:
                return dir_engine(url)


class RemoteDirectoryCrawler:

    def __init__(self, url, max_threads: int):
        self.url = url
        self.max_threads = max_threads

    def crawl_directory(self):
        try:
            directory = RemoteDirectoryFactory.get_directory(self.url)
            root_listing = directory.list_dir("/dl2/")  # todo get path
            directory.close()
        except TimeoutError:
            return

        in_q = Queue(maxsize=0)
        files_q = Queue(maxsize=0)
        for f in root_listing:
            if f.is_dir:
                in_q.put(f)
            else:
                files_q.put(f)

        threads = []
        for i in range(self.max_threads):
            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
            threads.append(worker)
            worker.start()

        in_q.join()
        print("DONE")

        # Kill threads
        for _ in threads:
            in_q.put(None)
        for t in threads:
            t.join()

        print(files_q.qsize())
        return []

    @staticmethod
    def _process_listings(url: str, in_q: Queue, files_q: Queue):

        directory = RemoteDirectoryFactory.get_directory(url)

        while directory:
            try:
                file = in_q.get(timeout=60)
            except Empty:
                break

            if file is None:
                break

            try:
                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
                for f in listing:
                    if f.is_dir:
                        in_q.put(f)
                    else:
                        files_q.put(f)
            except TooManyConnectionsError:
                print("TOO MANY CONNECTIONS")
            except TimeoutError:
                pass
            finally:
                in_q.task_done()
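
The shutdown logic above relies on a sentinel ("poison pill") pattern: crawl_directory() waits on in_q.join(), then pushes one None per worker so that each _process_listings loop exits. A minimal standalone sketch of that pattern, with illustrative names that are not part of this commit:

from queue import Queue
from threading import Thread


def worker(q: Queue):
    while True:
        item = q.get()
        if item is None:        # sentinel: this worker should exit
            q.task_done()
            break
        # ... process item here ...
        q.task_done()


q = Queue()
threads = [Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()
for item in range(10):
    q.put(item)
q.join()                        # blocks until task_done() was called for every queued item
for _ in threads:
    q.put(None)                 # one sentinel per worker
for t in threads:
    t.join()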

crawler/ftp.py (new file, 79 additions)

@@ -0,0 +1,79 @@
#! /usr/bin/env python
from urllib.parse import urlparse
import os
import time
import ftputil
import ftputil.error
from ftputil.session import session_factory
import random
import timeout_decorator

from crawler.crawler import RemoteDirectory, File, TooManyConnectionsError


class FtpDirectory(RemoteDirectory):

    SCHEMES = ("ftp", )

    def __init__(self, url):
        host = urlparse(url).netloc
        super().__init__(host)
        self.failed_attempts = 0
        self.max_attempts = 2
        self.ftp = None
        self.stop_when_connected()

    def _connect(self):
        self.ftp = ftputil.FTPHost(self.base_url, "anonymous", "od-database", session_factory=session_factory(
            use_passive_mode=False
        ))

    def stop_when_connected(self):
        while self.failed_attempts < self.max_attempts:
            try:
                self._connect()
                self.failed_attempts = 0
                break
            except ftputil.error.FTPError as e:
                if e.errno == 530:
                    print("Cancel connection - too many connections")
                    break
                self.failed_attempts += 1
                print("Connection error; reconnecting...")
                time.sleep(2 * random.uniform(0.5, 1.5))
                self.stop_when_connected()

    @timeout_decorator.timeout(15, use_signals=False)
    def list_dir(self, path) -> list:
        if not self.ftp:
            print("Conn closed")
            return []
        results = []
        try:
            self.ftp.chdir(path)
            file_names = self.ftp.listdir(path)

            for file_name in file_names:
                stat = self.ftp.stat(file_name)
                is_dir = self.ftp.path.isdir(os.path.join(path, file_name))

                results.append(File(
                    name=file_name,
                    mtime=stat.st_mtime,
                    size=-1 if is_dir else stat.st_size,
                    is_dir=is_dir,
                    path=path
                ))
        except ftputil.error.FTPError as e:
            if e.errno == 530:
                raise TooManyConnectionsError()

        return results

    def close(self):
        if self.ftp:
            self.ftp.close()
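
For reference, a hedged usage sketch of the FTP engine going through the scheme dispatch above; the host is hypothetical, and the server must accept anonymous logins, otherwise stop_when_connected() gives up and list_dir() returns an empty list:

# Hypothetical usage, not part of the commit
from crawler.crawler import RemoteDirectoryFactory

ftp_dir = RemoteDirectoryFactory.get_directory("ftp://ftp.example.com")  # -> FtpDirectory, connects on init
for f in ftp_dir.list_dir("/"):     # File objects; empty list if the connection failed
    print(f, f.size, f.mtime)       # "DIR <path>/<name>" or "FILE <path>/<name>"
ftp_dir.close()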

crawler/http.py (new file, 123 additions)

@@ -0,0 +1,123 @@
from urllib.parse import urlparse, urljoin, unquote
import os
from lxml import etree
from itertools import repeat
from crawler.crawler import RemoteDirectory, File
import requests
from requests.exceptions import RequestException
from multiprocessing.pool import ThreadPool


class HttpDirectory(RemoteDirectory):

    SCHEMES = ("http", "https",)
    HEADERS = {}
    BLACK_LIST = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )

    def __init__(self, url):
        super().__init__(url)
        self.parser = etree.HTMLParser(collect_ids=False)

    def list_dir(self, path) -> list:
        results = []

        path_url = urljoin(self.base_url, path)
        body = self._fetch_body(path_url)
        links = self._parse_links(body)

        urls_to_request = []

        for link in links:
            if self._should_ignore(link):
                continue
            file_url = urljoin(path_url, link[1])
            path, file_name = os.path.split(file_url[len(self.base_url) - 1:])

            if self._isdir(link):
                results.append(File(
                    name=file_name,
                    mtime="",
                    size=-1,
                    is_dir=True,
                    path=path
                ))
            else:
                urls_to_request.append(file_url)

        pool = ThreadPool(processes=10)
        files = pool.starmap(HttpDirectory._request_file, zip(repeat(self), urls_to_request))
        for f in files:
            if f:
                results.append(f)

        return results

    def _get_url(self, path: str):
        return urljoin(self.base_url, path)

    @staticmethod
    def _fetch_body(url: str):
        # todo timeout
        print("FETCH " + url)
        r = requests.get(url, headers=HttpDirectory.HEADERS)
        return r.text

    def _parse_links(self, body: str) -> set:
        result = set()

        tree = etree.HTML(body, parser=self.parser)
        links = tree.findall(".//a/[@href]")
        for link in links:
            result.add((link.text, link.get("href")))

        return result

    @staticmethod
    def _isdir(url):
        return url[1].rsplit("?", maxsplit=1)[0].endswith("/")

    def _request_file(self, url):
        # todo timeout
        retries = 3
        while retries > 0:
            try:
                print("HEAD " + url)
                r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)

                stripped_url = r.url[len(self.base_url) - 1:]
                path, name = os.path.split(stripped_url)

                return File(
                    path=unquote(path).strip("/"),
                    name=unquote(name),
                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                    mtime=r.headers["Date"] if "Date" in r.headers else "?",
                    is_dir=False
                )
            except RequestException:
                retries -= 1

        return None

    @staticmethod
    def _should_ignore(link):
        return link[0] == "../" or link[1].endswith(HttpDirectory.BLACK_LIST)
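
To illustrate the parsing helpers above, a small sketch that feeds a fabricated Apache-style index page through _parse_links(), _should_ignore() and _isdir(); the HTML snippet and URL are made up for illustration:

# Hypothetical sketch, not part of the commit
from crawler.crawler import RemoteDirectoryFactory

body = (
    '<html><body>'
    '<a href="../">../</a>'
    '<a href="?C=N;O=D">Name</a>'
    '<a href="subdir/">subdir/</a>'
    '<a href="file.iso">file.iso</a>'
    '</body></html>'
)

http_dir = RemoteDirectoryFactory.get_directory("http://example.com/")    # -> HttpDirectory
links = http_dir._parse_links(body)                                       # set of (text, href) tuples
kept = [l for l in links if not http_dir._should_ignore(l)]               # drops "../" and the ?C= sort links
dirs = [l for l in kept if http_dir._isdir(l)]                            # "subdir/" only; "file.iso" is a file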