Mirror of https://github.com/simon987/od-database.git (synced 2025-04-24 12:45:51 +00:00)
Attempt to handle looping directories

commit 073551df3c (parent dd93d40a55)
@@ -20,8 +20,13 @@ class File:
         self.path = path
         self.is_dir = is_dir
 
     def __str__(self):
         return ("DIR " if self.is_dir else "FILE ") + self.path + "/" + self.name
 
+    def __bytes__(self):
+        return b"|".join([
+            self.name.encode(),
+            b"D" if self.is_dir else b"F",
+            str(self.size).encode(),
+            str(self.mtime).encode(),
+        ])
+
     def to_json(self):
         return ujson.dumps({
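
The new __bytes__ method gives each listing entry a stable byte form that can be fed to a hash. A minimal standalone sketch (not part of the commit) of how that serialization combines into a per-directory fingerprint, reusing the field names from the hunk above:

import hashlib

# Standalone re-creation of the File entries above, for illustration only.
class File:
    def __init__(self, name, size, mtime, path, is_dir):
        self.name, self.size, self.mtime, self.path, self.is_dir = name, size, mtime, path, is_dir

    def __bytes__(self):
        # Same field order as the diff: name | D/F flag | size | mtime
        return b"|".join([
            self.name.encode(),
            b"D" if self.is_dir else b"F",
            str(self.size).encode(),
            str(self.mtime).encode(),
        ])

# Hash every entry of a directory listing into one fingerprint.
listing = [
    File(name="a.txt", size=12, mtime=1525000000, path="pub", is_dir=False),
    File(name="sub", size=0, mtime=0, path="pub", is_dir=True),
]
fingerprint = hashlib.sha1()
for entry in listing:
    fingerprint.update(bytes(entry))
print(fingerprint.hexdigest())  # identical entries always give the same digest
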
@@ -39,7 +44,7 @@ class RemoteDirectory:
     def __init__(self, base_url):
         self.base_url = base_url
 
-    def list_dir(self, path: str) -> list:
+    def list_dir(self, path: str):
         raise NotImplementedError
 
     def close(self):
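
With this signature change, list_dir implementations are expected to return a pair rather than a bare list: an identifier for the listing plus the entries themselves (see the FTP and HTTP hunks below). A rough sketch of the assumed contract, not the repository file:

class RemoteDirectory:
    def __init__(self, base_url):
        self.base_url = base_url

    def list_dir(self, path: str):
        # Assumed post-commit contract: return (path_id, entries), where path_id
        # identifies the listing (the FTP backend returns the path itself, the HTTP
        # backend a SHA-1 digest of the serialized entries) and entries is a list of File.
        raise NotImplementedError
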
@@ -82,8 +87,8 @@ class RemoteDirectoryCrawler:
 
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("")
-            self.crawled_paths.append("")
+            path, root_listing = directory.list_dir("")
+            self.crawled_paths.append(path)
             directory.close()
         except TimeoutError:
             return CrawlResult(0, "timeout")
@@ -136,9 +141,9 @@ class RemoteDirectoryCrawler:
                 break
 
             try:
-                if path not in self.crawled_paths:
-                    self.crawled_paths.append(path)
-                    listing = directory.list_dir(path)
+                path_id, listing = directory.list_dir(path)
+                if len(listing) > 0 and path_id not in self.crawled_paths:
+                    self.crawled_paths.append(path_id)
                     timeout_retries = RemoteDirectoryCrawler.MAX_TIMEOUT_RETRIES
 
                     for f in listing:
@@ -148,6 +153,9 @@ class RemoteDirectoryCrawler:
                             files_q.put(f)
                     import sys
                     print("LISTED " + repr(path) + "dirs:" + str(in_q.qsize()))
+                else:
+                    pass
+                    # print("SKIPPED: " + path + ", dropped " + str(len(listing)))
             except TooManyConnectionsError:
                 print("Too many connections")
                 # Kill worker and resubmit listing task
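
Taken together, these hunks are the crawler-side loop guard: the "already crawled" check is now keyed on the identifier returned by list_dir rather than on the requested path, so a directory reached twice through different routes (symlink loops, aliased paths) is listed once and then dropped. A condensed sketch of that logic, with the queue plumbing simplified and the child-path construction assumed:

import os

# Condensed loop-detection sketch; names follow the diff, everything else is simplified.
def process_path(directory, path, crawled_paths, in_q, files_q):
    path_id, listing = directory.list_dir(path)
    if len(listing) > 0 and path_id not in crawled_paths:
        crawled_paths.append(path_id)
        for f in listing:
            if f.is_dir:
                # assumption: subdirectories are queued for another worker roughly like this
                in_q.put(os.path.join(path, f.name, ""))
            else:
                files_q.put(f)
    else:
        # identifier already seen: the same content was reached through another path,
        # i.e. a looping or aliased directory, so the listing is dropped
        pass
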
@@ -44,7 +44,7 @@ class FtpDirectory(RemoteDirectory):
                 print("Connection error; reconnecting..." + e.strerror + " " + str(e.errno))
                 time.sleep(2 * random.uniform(0.5, 1.5))
 
-    def list_dir(self, path) -> list:
+    def list_dir(self, path):
         if not self.ftp:
             # No connection - assuming that connection was dropped because too many
             raise TooManyConnectionsError()
@@ -65,7 +65,7 @@ class FtpDirectory(RemoteDirectory):
                         is_dir=is_dir,
                         path=path
                     ))
-                return results
+                return path, results
             except ftputil.error.ParserError as e:
                 print("TODO: fix parsing error: " + e.strerror + " @ " + str(e.file_name))
                 break
@@ -82,7 +82,7 @@ class FtpDirectory(RemoteDirectory):
                 print(type(e))
                 raise e
 
-        return []
+        return path, []
 
     def try_stat(self, path):
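
On the FTP side both return points now hand back a tuple, with the requested path doubling as the listing identifier, so the caller's two-value unpacking keeps working even when the listing fails. A toy illustration (not the repository code) of why the empty fallback had to change too:

def list_dir_old_style(path):
    return []                      # old fallback: a bare list

def list_dir_new_style(path):
    return path, []                # new fallback: identifier plus an empty listing

# path_id, listing = list_dir_old_style("pub/")   # ValueError: not enough values to unpack
path_id, listing = list_dir_new_style("pub/")     # works; listing is simply empty
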
@@ -9,6 +9,7 @@ from requests.exceptions import RequestException
 from multiprocessing.pool import ThreadPool
 import config
 from dateutil.parser import parse as parse_date
+import hashlib
 
 
 class Anchor:
@@ -66,7 +67,9 @@ class HttpDirectory(RemoteDirectory):
         "?MA",
         "?SA",
         "?DA",
-        "?ND"
+        "?ND",
+        "?C=N&O=A",
+        "?C=N&O=A"
     )
     MAX_RETRIES = 3
@@ -79,31 +82,40 @@ class HttpDirectory(RemoteDirectory):
 
     def list_dir(self, path):
+
+        current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
+        path_identifier = hashlib.sha1(current_dir_name.encode())
         path_url = urljoin(self.base_url, path, "")
         body = self._stream_body(path_url)
         if not body:
-            return None
+            return None, None
         anchors = self._parse_links(body)
 
         urls_to_request = []
+        files = []
 
         for anchor in anchors:
             if self._should_ignore(self.base_url, anchor):
                 continue
 
             if self._isdir(anchor):
-                yield File(
+
+                directory = File(
                     name=anchor.href,
-                    mtime=None,
-                    size=None,
+                    mtime=0,
+                    size=0,
                     path=path,
                     is_dir=True
                 )
+                path_identifier.update(bytes(directory))
+                files.append(directory)
             else:
                 urls_to_request.append(urljoin(path_url, anchor.href))
 
         for file in self.request_files(urls_to_request):
-            yield file
+            files.append(file)
+            path_identifier.update(bytes(file))
+
+        return path_identifier.hexdigest(), files
 
     def request_files(self, urls_to_request: list) -> list:
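
For HTTP listings the identifier is a SHA-1 fingerprint seeded with the current directory's name and updated with every serialized entry, so two URLs that serve the same listing map to the same digest. A stripped-down, self-contained sketch of that fingerprinting (raw byte strings stand in for the File objects built inside list_dir):

import hashlib

def listing_fingerprint(path, entries):
    # seed with the name of the directory itself, using the same slicing as the diff
    current_dir_name = path[path.rstrip("/").rfind("/") + 1: -1]
    digest = hashlib.sha1(current_dir_name.encode())
    for entry in entries:
        digest.update(bytes(entry))   # the real code passes bytes(File)
    return digest.hexdigest()

same_listing = [b"a.iso|F|700|0", b"b.iso|F|701|0"]
# Two different paths ending in the same directory with identical content -> same digest.
print(listing_fingerprint("films/2018/", same_listing) ==
      listing_fingerprint("mirror/2018/", same_listing))  # True
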
@@ -168,11 +180,14 @@ class HttpDirectory(RemoteDirectory):
     def _parse_links(body):
 
         parser = HTMLAnchorParser()
+        anchors = []
 
         for chunk in body:
             parser.feed(chunk)
             for anchor in parser.anchors:
-                yield anchor
+                anchors.append(anchor)
+
+        return anchors
 
     @staticmethod
     def _isdir(link: Anchor):
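
_parse_links stops yielding anchors lazily and returns a materialized list, which matches list_dir no longer being a generator: the listing is hashed while it is built, tested with len() by the crawler and iterated again afterwards, none of which suits a one-shot generator. A short reminder of the difference:

def gen():
    yield "a/"
    yield "b.iso"

g = gen()
print(list(g))        # ['a/', 'b.iso']
print(list(g))        # []  -- a generator is exhausted after one pass
# print(len(g))       # TypeError: object of type 'generator' has no len()

anchors = list(gen())
print(len(anchors), anchors)   # a list supports len() and repeated iteration
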
@@ -180,7 +195,7 @@ class HttpDirectory(RemoteDirectory):
 
     @staticmethod
     def _should_ignore(base_url, link: Anchor):
-        if link.text == "../" or link.href == "../" or link.href == "./" \
+        if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
                 or link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
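
The ignore rule also gains an empty-href guard, and the blacklist picks up the Apache-style column-sort query strings (?C=N&O=A). Since BLACK_LIST is a tuple, a single str.endswith call covers every suffix; a small illustration using a subset of the values from the diff:

BLACK_LIST = ("?MA", "?SA", "?DA", "?ND", "?C=N&O=A")

def should_ignore(href: str) -> bool:
    # parent/self links, empty hrefs and known sort or metadata links are skipped
    return href in ("../", "./", "") or href.endswith(BLACK_LIST)

print(should_ignore("?C=N&O=A"))   # True  - column-sort link, not a real file
print(should_ignore("movie.mkv"))  # False - normal entry, keep it
print(should_ignore(""))           # True  - the new empty-href guard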