diff --git a/crawl_server/task_manager.py b/crawl_server/task_manager.py
index 12216aa..a21abc4 100644
--- a/crawl_server/task_manager.py
+++ b/crawl_server/task_manager.py
@@ -1,20 +1,14 @@
 from crawl_server.database import TaskManagerDatabase, Task
 from multiprocessing import Pool
 from apscheduler.schedulers.background import BackgroundScheduler
-from enum import Enum
 from datetime import datetime
 from crawler.crawler import RemoteDirectoryCrawler
 
 
-class TaskResultStatus(Enum):
-    SUCCESS = 0
-    FAILURE = 1
-
-
 class TaskResult:
 
     def __init__(self):
-        self.status_code: TaskResultStatus = None
+        self.status_code: str = None
         self.file_count = 0
         self.start_time = None
         self.end_time = None
@@ -56,7 +50,10 @@ class TaskManager:
         print("Starting task " + task.url)
 
         crawler = RemoteDirectoryCrawler(task.url, 10)
-        crawler.crawl_directory()
+        crawl_result = crawler.crawl_directory("12345.json")
+
+        result.file_count = crawl_result.file_count
+        result.status_code = crawl_result.status_code
 
         print("End task " + task.url)
 
@@ -67,6 +64,10 @@ class TaskManager:
 
     @staticmethod
     def task_complete(result: TaskResult):
        print("Task done " + str(result))
+        print(result.status_code)
+        print(result.file_count)
+        print(result.start_time)
+        print(result.end_time)
         # todo save in db
 
diff --git a/crawler/crawler.py b/crawler/crawler.py
index 73968a7..cd35bf9 100644
--- a/crawler/crawler.py
+++ b/crawler/crawler.py
@@ -61,20 +61,46 @@ class RemoteDirectoryFactory:
                 return dir_engine(url)
 
 
+def export_to_json(q: Queue, out_file: str) -> int:
+
+    counter = 0
+
+    with open(out_file, "w") as f:
+        while True:
+            try:
+                next_file: File = q.get_nowait()
+                f.write(next_file.to_json())
+                f.write("\n")
+                counter += 1
+            except Empty:
+                break
+
+    return counter
+
+
+class CrawlResult:
+
+    def __init__(self, file_count: int, status_code: str):
+        self.file_count = file_count
+        self.status_code = status_code
+
+
 class RemoteDirectoryCrawler:
 
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
+        self.crawled_paths = set()
 
-    def crawl_directory(self):
+    def crawl_directory(self, out_file: str) -> CrawlResult:
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("/")
+            root_listing = directory.list_dir("")
+            self.crawled_paths.add("")
             directory.close()
         except TimeoutError:
-            return
+            return CrawlResult(0, "timeout")
 
         in_q = Queue(maxsize=0)
         files_q = Queue(maxsize=0)
 
@@ -86,12 +112,15 @@ class RemoteDirectoryCrawler:
 
         threads = []
         for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
+            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
             threads.append(worker)
             worker.start()
 
         in_q.join()
-        print("DONE")
+        print("Done")
+
+        exported_count = export_to_json(files_q, out_file)
+        print("exported to " + out_file)
 
         # Kill threads
         for _ in threads:
@@ -99,11 +128,9 @@ class RemoteDirectoryCrawler:
         for t in threads:
             t.join()
 
-        print(files_q.qsize())
-        return []
+        return CrawlResult(exported_count, "success")
 
-    @staticmethod
-    def _process_listings(url: str, in_q: Queue, files_q: Queue):
+    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
 
         directory = RemoteDirectoryFactory.get_directory(url)
 
@@ -118,16 +145,21 @@ class RemoteDirectoryCrawler:
                 break
 
             try:
-                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
+                path = os.path.join(file.path, file.name, "")
+                if path not in self.crawled_paths:
+                    listing = directory.list_dir(path)
+                    self.crawled_paths.add(path)
 
-                for f in listing:
-                    if f.is_dir:
-                        in_q.put(f)
-                    else:
-                        files_q.put(f)
+                    for f in listing:
+                        if f.is_dir:
+                            in_q.put(f)
+                        else:
+                            files_q.put(f)
             except TooManyConnectionsError:
                 print("Too many connections")
             except TimeoutError:
                 pass
             finally:
                 in_q.task_done()
+
+
diff --git a/crawler/http.py b/crawler/http.py
index 3f09df7..dfa3dc6 100644
--- a/crawler/http.py
+++ b/crawler/http.py
@@ -38,10 +38,9 @@ class HttpDirectory(RemoteDirectory):
         self.parser = etree.HTMLParser(collect_ids=False)
 
     def list_dir(self, path) -> list:
-        results = []
 
-        path_url = urljoin(self.base_url, path)
+        path_url = os.path.join(self.base_url, path.strip("/"), "")
         body = self._fetch_body(path_url)
         if not body:
             return []
 
@@ -130,7 +129,7 @@ class HttpDirectory(RemoteDirectory):
             try:
                 r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
 
-                stripped_url = r.url[len(self.base_url) - 1:]
+                stripped_url = url[len(self.base_url) - 1:]
 
                 path, name = os.path.split(stripped_url)
 