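"""HTTP(S) implementation of RemoteDirectory: fetches an open directory
listing, parses the anchor tags out of the HTML index page, and turns
them into File entries, using HEAD requests to get file metadata."""
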
from urllib.parse import urljoin, unquote
import os
from lxml import etree
from crawler.crawler import RemoteDirectory, File
import requests
from requests.exceptions import RequestException
from multiprocessing.pool import ThreadPool
import config


class Link:

    def __init__(self, text: str, url: str):
        self.text = text
        self.url = url

    # Links are collected in a set in _parse_links, so compare and hash
    # by value; otherwise identical anchors would never be deduplicated.
    def __eq__(self, other):
        return isinstance(other, Link) and (self.text, self.url) == (other.text, other.url)

    def __hash__(self):
        return hash((self.text, self.url))


class HttpDirectory(RemoteDirectory):

    SCHEMES = ("http", "https",)
    HEADERS = config.HEADERS
    # Column-sort links emitted by Apache-style autoindex pages; following
    # them would only re-crawl the same listing in a different order.
    BLACK_LIST = (
        "?C=N&O=D",
        "?C=M&O=A",
        "?C=S&O=A",
        "?C=D&O=A",
        "?C=N;O=D",
        "?C=M;O=A",
        "?C=S;O=A",
        "?C=D;O=A"
    )
    MAX_RETRIES = 3

    def __init__(self, url):
        super().__init__(url)
        self.parser = etree.HTMLParser(collect_ids=False)

    def list_dir(self, path) -> list:

        results = []

        path_url = os.path.join(self.base_url, path.strip("/"), "")
        body = self._fetch_body(path_url)
        if not body:
            return []
        links = self._parse_links(body)

        urls_to_request = []

        for link in links:

            if self._should_ignore(link):
                continue

            file_url = urljoin(path_url, link.url)
            # Path of the entry relative to the root of the remote directory
            file_path, file_name = os.path.split(file_url[len(self.base_url) - 1:])

            if self._isdir(link):
                # Sub-directory: no extra request needed
                results.append(File(
                    name=file_name,
                    mtime="",
                    size=-1,
                    is_dir=True,
                    path=file_path
                ))
            else:
                urls_to_request.append(file_url)

        results.extend(self.request_files(urls_to_request))
        return results

    def request_files(self, urls_to_request: list) -> list:

        results = []

        if len(urls_to_request) > 3:
            # Many urls, use a thread pool (closed when the block exits)
            with ThreadPool(processes=10) as pool:
                files = pool.map(self._request_file, urls_to_request)
            for file in files:
                if file:
                    results.append(file)
        else:
            # Too few urls to justify creating a thread pool
            for url in urls_to_request:
                file = self._request_file(url)
                if file:
                    results.append(file)

        return results

    def _get_url(self, path: str):
        return urljoin(self.base_url, path)

    @staticmethod
    def _fetch_body(url: str):
        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                # Same 50s timeout as _request_file, so a dead server
                # cannot hang the crawl indefinitely
                r = requests.get(url, headers=HttpDirectory.HEADERS, timeout=50)
                return r.text
            except RequestException:
                retries -= 1

        return None

    def _parse_links(self, body: str) -> set:

        result = set()
        tree = etree.HTML(body, parser=self.parser)
        links = tree.findall(".//a[@href]")

        for link in links:
            result.add(Link(link.text, link.get("href")))

        return result

    @staticmethod
    def _isdir(link: Link):
        # A link points to a directory if its path ends with a slash
        # (any trailing query string is stripped first)
        return link.url.rsplit("?", maxsplit=1)[0].endswith("/")

    def _request_file(self, url):

        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                # HEAD request: the response headers carry the size and
                # date, no need to download the file body
                r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)

                stripped_url = url[len(self.base_url) - 1:]

                path, name = os.path.split(stripped_url)
                return File(
                    path=unquote(path).strip("/"),
                    name=unquote(name),
                    size=int(r.headers["Content-Length"]) if "Content-Length" in r.headers else -1,
                    mtime=r.headers["Date"] if "Date" in r.headers else "?",
                    is_dir=False
                )
            except RequestException:
                retries -= 1

        return None

    @staticmethod
    def _should_ignore(link: Link):
        return link.text == "../" or link.url.endswith(HttpDirectory.BLACK_LIST)
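

# --- Minimal usage sketch (not part of the original module) ---
# Assumes RemoteDirectory.__init__ stores the url as self.base_url
# (list_dir and _request_file above rely on that attribute) and that
# File exposes its constructor keywords (path, name, size, mtime,
# is_dir) as attributes; both classes live in crawler.crawler and are
# only imported here. The url is a placeholder.
if __name__ == "__main__":
    directory = HttpDirectory("http://example.com/pub/")
    for f in directory.list_dir(""):
        kind = "dir" if f.is_dir else "file"
        print(kind, f.path, f.name, f.size)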