diff --git a/crawl_server/remote_http.py b/crawl_server/remote_http.py index b9affc5..d8d2e24 100644 --- a/crawl_server/remote_http.py +++ b/crawl_server/remote_http.py @@ -36,7 +36,7 @@ class HttpDirectory(RemoteDirectory): def __init__(self, url): super().__init__(url) - self.parser = etree.HTMLParser(collect_ids=False, encoding="utf-8") + self.parser = etree.HTMLParser(collect_ids=False) def list_dir(self, path) -> list: results = [] @@ -103,17 +103,21 @@ class HttpDirectory(RemoteDirectory): while retries > 0: try: r = requests.get(url, headers=HttpDirectory.HEADERS) - return r.text + return r.content except RequestException: retries -= 1 return None - def _parse_links(self, body: str) -> set: + def _parse_links(self, body: bytes) -> set: result = set() tree = etree.HTML(body, parser=self.parser) - links = tree.findall(".//a/[@href]") + links = [] + try: + links = tree.findall(".//a/[@href]") + except AttributeError: + pass for link in links: result.add(Link(link.text, link.get("href"))) diff --git a/crawl_server/server.py b/crawl_server/server.py index 10af26d..40622fd 100644 --- a/crawl_server/server.py +++ b/crawl_server/server.py @@ -9,7 +9,7 @@ auth = HTTPTokenAuth(scheme="Token") tokens = [config.CRAWL_SERVER_TOKEN] -tm = TaskManager("tm_db.sqlite3", 8) +tm = TaskManager("tm_db.sqlite3", 64) @auth.verify_token diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py index b754b09..ac4710d 100644 --- a/crawl_server/task_manager.py +++ b/crawl_server/task_manager.py @@ -54,7 +54,7 @@ class TaskManager: print("Starting task " + task.url) - crawler = RemoteDirectoryCrawler(task.url, 100) + crawler = RemoteDirectoryCrawler(task.url, 10) crawl_result = crawler.crawl_directory("./crawled/" + str(task.website_id) + ".json") result.file_count = crawl_result.file_count diff --git a/requirements.txt b/requirements.txt index 44a97b8..f0db9e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ elasticsearch python-dateutil flask_httpauth ujson +timeout_decorator diff --git a/task.py b/task.py index e559a15..5267dbf 100644 --- a/task.py +++ b/task.py @@ -91,7 +91,8 @@ class TaskDispatcher: for task in server.fetch_completed_tasks(): print("Completed task") file_list = server.fetch_website_files(task.website_id) - self.search.import_json(file_list, task.website_id) + if file_list: + self.search.import_json(file_list, task.website_id) def dispatch_task(self, task: Task): self._get_available_crawl_server().queue_task(task)