diff --git a/crawl_server/crawler.py b/crawl_server/crawler.py
index 4e7af05..3c1c794 100644
--- a/crawl_server/crawler.py
+++ b/crawl_server/crawler.py
@@ -87,8 +87,8 @@ class RemoteDirectoryCrawler:
         try:
             try:
                 directory = RemoteDirectoryFactory.get_directory(self.url)
-                path, root_listing = directory.list_dir("")
-                self.crawled_paths.append(path)
+                path_id, root_listing = directory.list_dir(urlparse(self.url).path)
+                self.crawled_paths.append(path_id)
                 directory.close()
             except TimeoutError:
                 return CrawlResult(0, "timeout")
diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py
index 359877d..54f5260 100644
--- a/crawl_server/remote_http.py
+++ b/crawl_server/remote_http.py
@@ -73,6 +73,10 @@ class HttpDirectory(RemoteDirectory):
         "?C=N&O=A",
         "?C=N&O=A"
     )
+    FILE_NAME_BLACKLIST = (
+        "Parent Directory",
+        "../"
+    )
     MAX_RETRIES = 3
 
     def __init__(self, url):
@@ -114,8 +119,8 @@ class HttpDirectory(RemoteDirectory):
                     urls_to_request.append(urljoin(path_url, anchor.href))
 
         for file in self.request_files(urls_to_request):
-            files.append(file)
             path_identifier.update(bytes(file))
+            files.append(file)
 
         return path_identifier.hexdigest(), files
 
@@ -197,7 +202,7 @@ class HttpDirectory(RemoteDirectory):
 
     @staticmethod
     def _should_ignore(base_url, link: Anchor):
-        if link.text == "../" or link.href == "../" or link.href == "./" or link.href == "" \
+        if link.text in HttpDirectory.FILE_NAME_BLACKLIST or link.href in ("../", "./", "") \
                 or link.href.endswith(HttpDirectory.BLACK_LIST):
             return True
 
diff --git a/task.py b/task.py
index 1f1c830..bf671dd 100644
--- a/task.py
+++ b/task.py
@@ -1,5 +1,3 @@
-import random
-
 from apscheduler.schedulers.background import BackgroundScheduler
 from search.search import ElasticSearchEngine
 from crawl_server.database import Task, TaskResult
@@ -180,7 +178,7 @@ class TaskDispatcher:
 
         queued_tasks_by_server = self._get_current_tasks_by_server()
         server_with_most_free_slots = None
-        most_free_slots = 0
+        most_free_slots = -10000
 
         for server in queued_tasks_by_server:
             free_slots = server.slots - len(queued_tasks_by_server[server])
@@ -188,12 +186,11 @@ class TaskDispatcher:
                 server_with_most_free_slots = server
                 most_free_slots = free_slots
 
-        if server_with_most_free_slots:
-            print("Dispatching task to '" +
-                  server_with_most_free_slots.name + "' " +
-                  str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
+        print("Dispatching task to '" +
+              server_with_most_free_slots.name + "' " +
+              str(most_free_slots) + " free out of " + str(server_with_most_free_slots.slots))
 
-        return self.db.get_crawl_servers()[0]
+        return server_with_most_free_slots
 
     def get_queued_tasks(self) -> list: