Decentralised crawling should work in theory + temporary fix for going past the maximum of 10k results that Elasticsearch allows by default

Simon
2018-06-21 19:44:27 -04:00
parent 098ad2be72
commit 14d384e366
9 changed files with 275 additions and 84 deletions
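The 10k ceiling mentioned in the commit message is Elasticsearch's index.max_result_window setting, which caps from/size pagination at 10,000 hits. The part of the diff that implements the temporary fix is not included in this excerpt; a common quick workaround is simply raising that window on the index, sketched below with an assumed index name and client setup that are not taken from this commit:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumed local cluster; not part of this commit

# By default from/size pagination stops at index.max_result_window (10 000).
# Raising it lets result pages go deeper at the cost of more memory per
# search, which is why it is only a temporary fix; search_after or the
# scroll API are the usual long-term alternatives.
es.indices.put_settings(index="od-database", body={
    "index": {"max_result_window": 100000}
})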


@@ -84,46 +84,48 @@ class RemoteDirectoryCrawler:
         self.crawled_paths = list()
 
     def crawl_directory(self, out_file: str) -> CrawlResult:
-        try:
-            directory = RemoteDirectoryFactory.get_directory(self.url)
-            path, root_listing = directory.list_dir("")
-            self.crawled_paths.append(path)
-            directory.close()
-        except TimeoutError:
-            return CrawlResult(0, "timeout")
+        try:
+            try:
+                directory = RemoteDirectoryFactory.get_directory(self.url)
+                path, root_listing = directory.list_dir("")
+                self.crawled_paths.append(path)
+                directory.close()
+            except TimeoutError:
+                return CrawlResult(0, "timeout")
 
-        in_q = Queue(maxsize=0)
-        files_q = Queue(maxsize=0)
-        for f in root_listing:
-            if f.is_dir:
-                in_q.put(os.path.join(f.path, f.name, ""))
-            else:
-                files_q.put(f)
+            in_q = Queue(maxsize=0)
+            files_q = Queue(maxsize=0)
+            for f in root_listing:
+                if f.is_dir:
+                    in_q.put(os.path.join(f.path, f.name, ""))
+                else:
+                    files_q.put(f)
 
-        threads = []
-        for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
-            threads.append(worker)
-            worker.start()
+            threads = []
+            for i in range(self.max_threads):
+                worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
+                threads.append(worker)
+                worker.start()
 
-        files_written = []  # Pass array to worker to get result
-        file_writer_thread = Thread(target=RemoteDirectoryCrawler._log_to_file, args=(files_q, out_file, files_written))
-        file_writer_thread.start()
+            files_written = []  # Pass array to worker to get result
+            file_writer_thread = Thread(target=RemoteDirectoryCrawler._log_to_file, args=(files_q, out_file, files_written))
+            file_writer_thread.start()
 
-        in_q.join()
-        files_q.join()
-        print("Done")
+            in_q.join()
+            files_q.join()
+            print("Done")
 
-        # Kill threads
-        for _ in threads:
-            in_q.put(None)
-        for t in threads:
-            t.join()
-        files_q.put(None)
-        file_writer_thread.join()
+            # Kill threads
+            for _ in threads:
+                in_q.put(None)
+            for t in threads:
+                t.join()
+            files_q.put(None)
+            file_writer_thread.join()
 
-        return CrawlResult(files_written[0], "success")
+            return CrawlResult(files_written[0], "success")
+        except Exception as e:
+            return CrawlResult(0, str(e) + " \nType:" + str(type(e)))
 
     def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
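In the new version above, crawl_directory fans the work out over two queues: directory paths to list go into in_q, discovered files into files_q, max_threads worker threads run _process_listings, and a single writer thread drains files_q into the output file. Shutdown uses sentinels: one None per worker is pushed into in_q, and one into files_q for the writer. A self-contained sketch of that queue/sentinel pattern follows; the names and the fake "work" are illustrative and not from the repository:

from queue import Queue
from threading import Thread

def worker(in_q: Queue, out_q: Queue) -> None:
    # Drain the input queue until a None sentinel arrives, matching how the
    # crawler stops its workers by putting one None per thread.
    while True:
        item = in_q.get()
        if item is None:
            in_q.task_done()
            break
        out_q.put(item.upper())  # stand-in for "list this directory"
        in_q.task_done()

in_q, out_q = Queue(), Queue()
threads = [Thread(target=worker, args=(in_q, out_q)) for _ in range(4)]
for t in threads:
    t.start()

for path in ("music/", "videos/", "docs/"):
    in_q.put(path)
in_q.join()          # wait until every queued path has been processed

for _ in threads:    # one sentinel per worker, then wait for them to exit
    in_q.put(None)
for t in threads:
    t.join()

while not out_q.empty():
    print(out_q.get())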


@@ -7,15 +7,14 @@ import config
 app = Flask(__name__)
 auth = HTTPTokenAuth(scheme="Token")
 
-tokens = [config.CRAWL_SERVER_TOKEN]
+token = config.CRAWL_SERVER_TOKEN
 
 tm = TaskManager("tm_db.sqlite3", 32)
 
 
 @auth.verify_token
-def verify_token(token):
-    if token in tokens:
-        return True
+def verify_token(provided_token):
+    return token == provided_token
 
 
 @app.route("/task/")
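For context, flask_httpauth calls the verify_token callback above for every request that carries the configured Token scheme, and routes such as /task/ are gated with @auth.login_required. A minimal, self-contained sketch of that wiring; the placeholder token and route body are assumptions, not code from server.py:

from flask import Flask, jsonify
from flask_httpauth import HTTPTokenAuth

app = Flask(__name__)
auth = HTTPTokenAuth(scheme="Token")

API_TOKEN = "changeme"  # stands in for config.CRAWL_SERVER_TOKEN

@auth.verify_token
def verify_token(provided_token):
    # A truthy return value means the request is authenticated.
    return API_TOKEN == provided_token

@app.route("/task/")
@auth.login_required
def get_task():
    return jsonify({"task": None})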
@@ -99,4 +98,4 @@ def get_stats():
 if __name__ == "__main__":
-    app.run(port=5001, host="0.0.0.0")
+    app.run(port=config.CRAWL_SERVER_PORT, host="0.0.0.0", ssl_context="adhoc")
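On the caller's side, scheme="Token" means the expected header is Authorization: Token <value>, and ssl_context="adhoc" gives the server a self-signed certificate, so clients either skip verification or pin the certificate. A hedged sketch of a request against the /task/ endpoint, with assumed host, port and token values:

import requests

# Assumed values; in practice these come from the crawl server's config.
CRAWL_SERVER = "https://localhost:5001"
TOKEN = "changeme"

response = requests.get(
    CRAWL_SERVER + "/task/",
    headers={"Authorization": "Token " + TOKEN},
    verify=False,  # the ad-hoc SSL context generates a self-signed certificate
)
print(response.status_code, response.text)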