Bug fixes + export to json

Simon 2018-06-11 20:02:30 -04:00
parent d849227798
commit fcfd7d4acc
3 changed files with 58 additions and 26 deletions

View File

@@ -1,20 +1,14 @@
 from crawl_server.database import TaskManagerDatabase, Task
 from multiprocessing import Pool
 from apscheduler.schedulers.background import BackgroundScheduler
-from enum import Enum
 from datetime import datetime
+from crawler.crawler import RemoteDirectoryCrawler
-class TaskResultStatus(Enum):
-    SUCCESS = 0
-    FAILURE = 1
 class TaskResult:
     def __init__(self):
-        self.status_code: TaskResultStatus = None
+        self.status_code: str = None
         self.file_count = 0
         self.start_time = None
         self.end_time = None
@@ -56,7 +50,10 @@ class TaskManager:
         print("Starting task " + task.url)
         crawler = RemoteDirectoryCrawler(task.url, 10)
-        crawler.crawl_directory()
+        crawl_result = crawler.crawl_directory("12345.json")
+        result.file_count = crawl_result.file_count
+        result.status_code = crawl_result.status_code
         print("End task " + task.url)
@@ -67,6 +64,10 @@ class TaskManager:
     @staticmethod
     def task_complete(result: TaskResult):
         print("Task done " + str(result))
+        print(result.status_code)
+        print(result.file_count)
+        print(result.start_time)
+        print(result.end_time)
         # todo save in db
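
Note: with the status enum gone, crawl_directory() now hands back a CrawlResult and the task manager copies its fields onto the TaskResult before it is eventually saved. A minimal sketch of that hand-off, using trimmed-down stand-ins for the classes in this diff (fake_crawl() is a hypothetical placeholder for RemoteDirectoryCrawler.crawl_directory()):

# Sketch only: CrawlResult/TaskResult are simplified stand-ins for the classes
# in this commit, and fake_crawl() is a hypothetical placeholder for the crawler.
from datetime import datetime


class CrawlResult:
    def __init__(self, file_count: int, status_code: str):
        self.file_count = file_count
        self.status_code = status_code


class TaskResult:
    def __init__(self):
        self.status_code: str = None
        self.file_count = 0
        self.start_time = None
        self.end_time = None


def fake_crawl(out_file: str) -> CrawlResult:
    # Placeholder: the real crawler would write its results to out_file.
    return CrawlResult(file_count=42, status_code="success")


result = TaskResult()
result.start_time = datetime.utcnow()
crawl_result = fake_crawl("12345.json")
result.file_count = crawl_result.file_count
result.status_code = crawl_result.status_code
result.end_time = datetime.utcnow()
print(result.status_code, result.file_count, result.start_time, result.end_time)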

View File

@@ -61,20 +61,46 @@ class RemoteDirectoryFactory:
         return dir_engine(url)
+def export_to_json(q: Queue, out_file: str) -> int:
+    counter = 0
+    with open(out_file, "w") as f:
+        while True:
+            try:
+                next_file: File = q.get_nowait()
+                f.write(next_file.to_json())
+                f.write("\n")
+                counter += 1
+            except Empty:
+                break
+    return counter
+class CrawlResult:
+    def __init__(self, file_count: int, status_code: str):
+        self.file_count = file_count
+        self.status_code = status_code
 class RemoteDirectoryCrawler:
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
+        self.crawled_paths = set()
-    def crawl_directory(self):
+    def crawl_directory(self, out_file: str) -> CrawlResult:
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("/")
+            root_listing = directory.list_dir("")
+            self.crawled_paths.add("")
             directory.close()
         except TimeoutError:
-            return
+            return CrawlResult(0, "timeout")
         in_q = Queue(maxsize=0)
         files_q = Queue(maxsize=0)
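
For context, the new export_to_json() drains the shared files queue with get_nowait() until queue.Empty and writes one JSON document per line. A standalone sketch of the same pattern (Record is a hypothetical stand-in for the crawler's File class; only a to_json() method is assumed, and the output filename is illustrative):

# Standalone sketch of the drain-queue-to-NDJSON pattern used by export_to_json().
# "Record" stands in for the crawler's File class; only to_json() is assumed.
import json
from queue import Queue, Empty


class Record:
    def __init__(self, path: str, name: str):
        self.path = path
        self.name = name

    def to_json(self) -> str:
        return json.dumps({"path": self.path, "name": self.name})


def export_to_json(q: Queue, out_file: str) -> int:
    counter = 0
    with open(out_file, "w") as f:
        while True:
            try:
                f.write(q.get_nowait().to_json())
                f.write("\n")
                counter += 1
            except Empty:
                break
    return counter


files_q = Queue()
files_q.put(Record("movies", "film.mkv"))
files_q.put(Record("", "readme.txt"))
print(export_to_json(files_q, "example_out.json"))  # prints 2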
@@ -86,12 +112,15 @@ class RemoteDirectoryCrawler:
         threads = []
         for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
+            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
             threads.append(worker)
             worker.start()
         in_q.join()
-        print("DONE")
+        print("Done")
+        exported_count = export_to_json(files_q, out_file)
+        print("exported to " + out_file)
         # Kill threads
         for _ in threads:
@@ -99,11 +128,9 @@ class RemoteDirectoryCrawler:
         for t in threads:
             t.join()
-        print(files_q.qsize())
-        return []
+        return CrawlResult(exported_count, "success")
     @staticmethod
-    def _process_listings(url: str, in_q: Queue, files_q: Queue):
+    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
         directory = RemoteDirectoryFactory.get_directory(url)
@@ -118,7 +145,10 @@ class RemoteDirectoryCrawler:
                 break
             try:
-                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
+                path = os.path.join(file.path, file.name, "")
+                if path not in self.crawled_paths:
+                    listing = directory.list_dir(path)
+                    self.crawled_paths.add(path)
                 for f in listing:
                     if f.is_dir:
@@ -131,3 +161,5 @@ class RemoteDirectoryCrawler:
                 pass
             finally:
                 in_q.task_done()
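
_process_listings() is also where the new crawled_paths set comes into play: each directory path is normalised with a trailing separator and listed at most once, even if it is reachable from several parent listings. A small sketch of that check in isolation (list_dir() below is a hypothetical placeholder for RemoteDirectory.list_dir()):

# Sketch of the crawled_paths dedup; list_dir() is a hypothetical placeholder.
import os

crawled_paths = set()


def list_dir(path: str) -> list:
    return []  # placeholder listing


def visit(parent_path: str, name: str) -> bool:
    path = os.path.join(parent_path, name, "")  # e.g. "movies/2018/"
    if path in crawled_paths:
        return False                            # already listed, skip
    crawled_paths.add(path)
    list_dir(path)
    return True


print(visit("movies", "2018"))  # True: first visit
print(visit("movies", "2018"))  # False: deduplicated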

View File

@@ -38,10 +38,9 @@ class HttpDirectory(RemoteDirectory):
         self.parser = etree.HTMLParser(collect_ids=False)
     def list_dir(self, path) -> list:
         results = []
-        path_url = urljoin(self.base_url, path)
+        path_url = os.path.join(self.base_url, path.strip("/"), "")
         body = self._fetch_body(path_url)
         if not body:
             return []
@@ -130,7 +129,7 @@ class HttpDirectory(RemoteDirectory):
         try:
             r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
-            stripped_url = r.url[len(self.base_url) - 1:]
+            stripped_url = url[len(self.base_url) - 1:]
             path, name = os.path.split(stripped_url)
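
The switch from urljoin() to os.path.join(self.base_url, path.strip("/"), "") changes how listing URLs are built: urljoin() discards the base path whenever the relative path starts with "/", while the join keeps the base and guarantees a trailing slash. A quick comparison with illustrative values only (the join variant produces valid URLs only where os.path.sep is "/"):

# Old vs. new URL construction, with example values.
import os
from urllib.parse import urljoin

base_url = "http://example.com/files/"
path = "/movies/2018"

print(urljoin(base_url, path))
# http://example.com/movies/2018          <- base path "files/" is dropped

print(os.path.join(base_url, path.strip("/"), ""))
# http://example.com/files/movies/2018/   (on platforms where os.path.sep == "/")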