Attempt to handle looping directories

Simon 2018-06-21 11:54:40 -04:00
parent dd93d40a55
commit 073551df3c
3 changed files with 42 additions and 19 deletions
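
Background for the change: a crawler that follows directory links blindly can loop forever when a symlink or server alias points back up the tree, because the same content keeps reappearing under new, ever-longer paths. The countermeasure in this commit is to identify each directory by a SHA-1 digest of its name plus the serialized entries of its listing, and to skip any directory whose digest has already been crawled. A minimal standalone sketch of that idea (the helper names listing_fingerprint and should_crawl are illustrative, not part of the commit):

import hashlib

def listing_fingerprint(dir_name, entries):
    # Digest the directory name plus each entry serialized as
    # name|type-flag|size|mtime, mirroring File.__bytes__ below.
    h = hashlib.sha1(dir_name.encode())
    for name, is_dir, size, mtime in entries:
        h.update(b"|".join([
            name.encode(),
            b"D" if is_dir else b"F",
            str(size).encode(),
            str(mtime).encode(),
        ]))
    return h.hexdigest()

crawled_fingerprints = set()

def should_crawl(dir_name, entries):
    # A looping directory reproduces an identical listing under a
    # new path, so a repeated fingerprint means "already visited".
    fingerprint = listing_fingerprint(dir_name, entries)
    if fingerprint in crawled_fingerprints:
        return False
    crawled_fingerprints.add(fingerprint)
    return True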

View File

@@ -20,8 +20,13 @@ class File:
         self.path = path
         self.is_dir = is_dir
 
-    def __str__(self):
-        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
+    def __bytes__(self):
+        return b"|".join([
+            self.name.encode(),
+            b"D" if self.is_dir else b"F",
+            str(self.size).encode(),
+            str(self.mtime).encode(),
+        ])
 
     def to_json(self):
         return ujson.dumps({
@@ -39,7 +44,7 @@ class RemoteDirectory:
     def __init__(self, base_url):
         self.base_url = base_url
 
-    def list_dir(self, path: str) -> list:
+    def list_dir(self, path: str):
         raise NotImplementedError
 
     def close(self):
@@ -82,8 +87,8 @@ class RemoteDirectoryCrawler:
        try:
            directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("")
-            self.crawled_paths.append("")
+            path, root_listing = directory.list_dir("")
+            self.crawled_paths.append(path)
            directory.close()
        except TimeoutError:
            return CrawlResult(0, "timeout")
@@ -136,9 +141,9 @@ class RemoteDirectoryCrawler:
                break
 
            try:
-                if path not in self.crawled_paths:
-                    self.crawled_paths.append(path)
-                    listing = directory.list_dir(path)
+                path_id, listing = directory.list_dir(path)
+                if len(listing) > 0 and path_id not in self.crawled_paths:
+                    self.crawled_paths.append(path_id)
                    timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
                    for f in listing:
@@ -148,6 +153,9 @@ class RemoteDirectoryCrawler:
                        files_q.put(f)
                    import sys
                    print("LISTED " + repr(path) + "dirs:" + str(in_q.qsize()))
+                else:
+                    pass
+                    # print("SKIPPED: " + path + ", dropped " + str(len(listing)))
            except TooManyConnectionsError:
                print("Too many connections")
                # Kill worker and resubmit listing task

View File

@@ -44,7 +44,7 @@ class FtpDirectory(RemoteDirectory):
                print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
                time.sleep(2 * random.uniform(0.5, 1.5))
 
-    def list_dir(self, path) -> list:
+    def list_dir(self, path):
        if not self.ftp:
            # No connection - assuming that connection was dropped because too many
            raise TooManyConnectionsError()
@@ -65,7 +65,7 @@ class FtpDirectory(RemoteDirectory):
                        is_dir=is_dir,
                        path=path
                    ))
-                return results
+                return path, results
            except ftputil.error.ParserError as e:
                print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
                break
@@ -82,7 +82,7 @@ class FtpDirectory(RemoteDirectory):
                print(type(e))
                raise e
 
-        return []
+        return path, []
 
    def try_stat(self, path):

View File

@@ -9,6 +9,7 @@ from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+import hashlib
 
 
 class Anchor:
@@ -66,7 +67,9 @@ class HttpDirectory(RemoteDirectory):
        "?MA",
        "?SA",
        "?DA",
-        "?ND"
+        "?ND",
+        "?C=N&O=A",
+        "?C=N&O=A"
    )
 
    MAX_RETRIES = 3
@@ -79,31 +82,40 @@ class HttpDirectory(RemoteDirectory):
    def list_dir(self, path):
+        current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
+        path_identifier = hashlib.sha1(current_dir_name.encode())
        path_url = urljoin(self.base_url, path, "")
        body = self._stream_body(path_url)
        if not body:
-            return None
+            return None, None
        anchors = self._parse_links(body)
 
        urls_to_request = []
+        files = []
 
        for anchor in anchors:
            if self._should_ignore(self.base_url, anchor):
                continue
 
            if self._isdir(anchor):
-                yield File(
+                directory = File(
                    name=anchor.href,
-                    mtime=None,
-                    size=None,
+                    mtime=0,
+                    size=0,
                    path=path,
                    is_dir=True
                )
+                path_identifier.update(bytes(directory))
+                files.append(directory)
            else:
                urls_to_request.append(urljoin(path_url, anchor.href))
 
        for file in self.request_files(urls_to_request):
-            yield file
+            files.append(file)
+            path_identifier.update(bytes(file))
+
+        return path_identifier.hexdigest(), files
 
    def request_files(self, urls_to_request: list) -> list:
@@ -168,11 +180,14 @@ class HttpDirectory(RemoteDirectory):
    def _parse_links(body):
 
        parser = HTMLAnchorParser()
+        anchors = []
 
        for chunk in body:
            parser.feed(chunk)
            for anchor in parser.anchors:
-                yield anchor
+                anchors.append(anchor)
+
+        return anchors
 
    @staticmethod
    def _isdir(link: Anchor):
@@ -180,7 +195,7 @@ class HttpDirectory(RemoteDirectory):
    @staticmethod
    def _should_ignore(base_url, link: Anchor):
-        if link.text == "../" or link.href == "../" or link.href == "./" \
+        if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
                or link.href.endswith(HttpDirectory.BLACK_LIST):
            return True
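
After this commit, list_dir implementations return a (path_identifier, files) pair instead of yielding File objects, so callers can deduplicate on the identifier before descending. A rough sketch of a caller honoring the new contract (recursion and path joining are simplified for illustration; the real crawler drains a work queue instead):

crawled_paths = []

def crawl(directory, path):
    # directory is any RemoteDirectory; an empty listing or a
    # previously seen identifier means the path is skipped.
    path_id, listing = directory.list_dir(path)
    if not listing or path_id in crawled_paths:
        return
    crawled_paths.append(path_id)
    for f in listing:
        if f.is_dir:
            crawl(directory, path + f.name + "/")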