diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 4acb78a..358786a 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -20,8 +20,13 @@ class File:
         self.path = path
         self.is_dir = is_dir
 
-    def __str__(self):
-        return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
+    def __bytes__(self):
+        return b"|".join([
+            self.name.encode(),
+            b"D" if self.is_dir else b"F",
+            str(self.size).encode(),
+            str(self.mtime).encode(),
+        ])
 
     def to_json(self):
         return ujson.dumps({
@@ -39,7 +44,7 @@ class RemoteDirectory:
     def __init__(self, base_url):
         self.base_url = base_url
 
-    def list_dir(self, path: str) -> list:
+    def list_dir(self, path: str):
         raise NotImplementedError
 
     def close(self):
@@ -82,8 +87,8 @@ class RemoteDirectoryCrawler:
 
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("")
-            self.crawled_paths.append("")
+            path, root_listing = directory.list_dir("")
+            self.crawled_paths.append(path)
             directory.close()
         except TimeoutError:
             return CrawlResult(0, "timeout")
@@ -136,9 +141,9 @@ class RemoteDirectoryCrawler:
                 break
 
             try:
-                if path not in self.crawled_paths:
-                    self.crawled_paths.append(path)
-                    listing = directory.list_dir(path)
+                path_id, listing = directory.list_dir(path)
+                if len(listing) > 0 and path_id not in self.crawled_paths:
+                    self.crawled_paths.append(path_id)
                     timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
                     for f in listing:
@@ -148,6 +153,9 @@ class RemoteDirectoryCrawler:
                             files_q.put(f)
                     import sys
                     print("LISTED " + repr(path) + "dirs:" + str(in_q.qsize()))
+                else:
+                    pass
+                    # print("SKIPPED: " + path + ", dropped " + str(len(listing)))
             except TooManyConnectionsError:
                 print("Too many connections")
                 # Kill worker and resubmit listing task
diff --git a/crawl_server/remote_ftp.py b/crawl_server/remote_ftp.py
index 3c81521..d5b500a 100644
--- a/crawl_server/remote_ftp.py
+++ b/crawl_server/remote_ftp.py
@@ -44,7 +44,7 @@ class FtpDirectory(RemoteDirectory):
                 print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
                 time.sleep(2 * random.uniform(0.5, 1.5))
 
-    def list_dir(self, path) -> list:
+    def list_dir(self, path):
         if not self.ftp:
             # No connection - assuming that connection was dropped because too many
             raise TooManyConnectionsError()
@@ -65,7 +65,7 @@ class FtpDirectory(RemoteDirectory):
                         is_dir=is_dir,
                         path=path
                     ))
-                return results
+                return path, results
             except ftputil.error.ParserError as e:
                 print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
                 break
@@ -82,7 +82,7 @@ class FtpDirectory(RemoteDirectory):
             print(type(e))
             raise e
 
-        return []
+        return path, []
 
     def try_stat(self, path):
 
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index c8134e1..a5060bf 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -9,6 +9,7 @@ from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+import hashlib
 
 
 class Anchor:
@@ -66,7 +67,9 @@ class HttpDirectory(RemoteDirectory):
         "?MA",
         "?SA",
         "?DA",
-        "?ND"
+        "?ND",
+        "?C=N&O=A",
+        "?C=N&O=A"
     )
 
     MAX_RETRIES = 3
@@ -79,31 +82,40 @@ class HttpDirectory(RemoteDirectory):
 
     def list_dir(self, path):
 
+        current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
+        path_identifier = hashlib.sha1(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
-            return None
+            return None, None
 
         anchors = self._parse_links(body)
 
         urls_to_request = []
+        files = []
 
         for anchor in anchors:
             if self._should_ignore(self.base_url, anchor):
                 continue
 
             if self._isdir(anchor):
-                yield File(
+
+                directory = File(
                     name=anchor.href,
-                    mtime=None,
-                    size=None,
+                    mtime=0,
+                    size=0,
                     path=path,
                     is_dir=True
                 )
+                path_identifier.update(bytes(directory))
+                files.append(directory)
             else:
                 urls_to_request.append(urljoin(path_url, anchor.href))
 
         for file in self.request_files(urls_to_request):
-            yield file
+            files.append(file)
+            path_identifier.update(bytes(file))
+
+        return path_identifier.hexdigest(), files
 
     def request_files(self, urls_to_request: list) -> list:
@@ -168,11 +180,14 @@ class HttpDirectory(RemoteDirectory):
     def _parse_links(body):
 
         parser = HTMLAnchorParser()
+        anchors = []
 
         for chunk in body:
             parser.feed(chunk)
             for anchor in parser.anchors:
-                yield anchor
+                anchors.append(anchor)
+
+        return anchors
 
     @staticmethod
     def _isdir(link: Anchor):
@@ -180,7 +195,7 @@ class HttpDirectory(RemoteDirectory):
 
     @staticmethod
     def _should_ignore(base_url, link: Anchor):
-        if link.text == "../" or link.href == "../" or link.href == "./" \
+        if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
                 or link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
 
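Taken together, these hunks change list_dir() from a generator into a method that returns a (path identifier, listing) pair: the HTTP implementation seeds a SHA-1 digest with the current directory name and folds in each entry's File.__bytes__ serialization ("name|D-or-F|size|mtime"), and the crawler records that digest in crawled_paths, apparently so that a directory whose listing it has already seen (for example the same content reachable through a symlink or an aliased URL) is skipped on a later visit. Below is a minimal standalone sketch of that de-duplication idea; it uses plain dicts for entries and a hypothetical listing_identifier() helper rather than the classes in the patch.

import hashlib


def listing_identifier(dir_name: str, listing: list) -> str:
    # Same scheme as File.__bytes__ above: "name|D-or-F|size|mtime" per entry,
    # folded into a SHA-1 digest seeded with the directory name.
    digest = hashlib.sha1(dir_name.encode())
    for entry in listing:
        digest.update(b"|".join([
            entry["name"].encode(),
            b"D" if entry["is_dir"] else b"F",
            str(entry["size"]).encode(),
            str(entry["mtime"]).encode(),
        ]))
    return digest.hexdigest()


crawled_paths = []  # the crawler keeps listing digests instead of raw paths

listing = [
    {"name": "a.bin", "is_dir": False, "size": 1024, "mtime": 1530000000},
    {"name": "sub", "is_dir": True, "size": 0, "mtime": 0},
]

path_id = listing_identifier("files", listing)
if listing and path_id not in crawled_paths:
    crawled_paths.append(path_id)  # first sighting of this listing: crawl it
# A second URL that resolves to the same listing produces the same path_id
# and fails the membership test, which is what the new
# `path_id not in self.crawled_paths` check in crawler.py relies on.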