Bug fixes + export to json

Simon 2018-06-11 20:02:30 -04:00
parent d849227798
commit fcfd7d4acc
3 changed files with 58 additions and 26 deletions

View File

@@ -1,20 +1,14 @@
 from crawl_server.database import TaskManagerDatabase, Task
 from multiprocessing import Pool
 from apscheduler.schedulers.background import BackgroundScheduler
-from enum import Enum
 from datetime import datetime
+from crawler.crawler import RemoteDirectoryCrawler
-class TaskResultStatus(Enum):
-    SUCCESS = 0
-    FAILURE = 1
 class TaskResult:
     def __init__(self):
-        self.status_code: TaskResultStatus = None
+        self.status_code: str = None
         self.file_count = 0
         self.start_time = None
         self.end_time = None
@@ -56,7 +50,10 @@ class TaskManager:
         print("Starting task " + task.url)
         crawler = RemoteDirectoryCrawler(task.url, 10)
-        crawler.crawl_directory()
+        crawl_result = crawler.crawl_directory("12345.json")
+        result.file_count = crawl_result.file_count
+        result.status_code = crawl_result.status_code
         print("End task " + task.url)
@@ -67,6 +64,10 @@ class TaskManager:
     @staticmethod
     def task_complete(result: TaskResult):
         print("Task done " + str(result))
+        print(result.status_code)
+        print(result.file_count)
+        print(result.start_time)
+        print(result.end_time)
         # todo save in db
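
Note: with the status enum gone, crawl_directory() now hands back a CrawlResult and the task manager copies its fields onto the TaskResult before it is eventually saved. A minimal sketch of that hand-off, using trimmed-down stand-ins for the classes in this diff (fake_crawl() is a hypothetical placeholder for RemoteDirectoryCrawler.crawl_directory()):

# Sketch only: CrawlResult/TaskResult are simplified stand-ins for the classes
# in this commit, and fake_crawl() is a hypothetical placeholder for the crawler.
from datetime import datetime


class CrawlResult:
    def __init__(self, file_count: int, status_code: str):
        self.file_count = file_count
        self.status_code = status_code


class TaskResult:
    def __init__(self):
        self.status_code: str = None
        self.file_count = 0
        self.start_time = None
        self.end_time = None


def fake_crawl(out_file: str) -> CrawlResult:
    # Placeholder: the real crawler would write its results to out_file.
    return CrawlResult(file_count=42, status_code="success")


result = TaskResult()
result.start_time = datetime.utcnow()
crawl_result = fake_crawl("12345.json")
result.file_count = crawl_result.file_count
result.status_code = crawl_result.status_code
result.end_time = datetime.utcnow()
print(result.status_code, result.file_count, result.start_time, result.end_time)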

View File

@@ -61,20 +61,46 @@ class RemoteDirectoryFactory:
         return dir_engine(url)
+def export_to_json(q: Queue, out_file: str) -> int:
+    counter = 0
+    with open(out_file, "w") as f:
+        while True:
+            try:
+                next_file: File = q.get_nowait()
+                f.write(next_file.to_json())
+                f.write("\n")
+                counter += 1
+            except Empty:
+                break
+    return counter
+class CrawlResult:
+    def __init__(self, file_count: int, status_code: str):
+        self.file_count = file_count
+        self.status_code = status_code
 class RemoteDirectoryCrawler:
     def __init__(self, url, max_threads: int):
         self.url = url
         self.max_threads = max_threads
+        self.crawled_paths = set()
-    def crawl_directory(self):
+    def crawl_directory(self, out_file: str) -> CrawlResult:
         try:
             directory = RemoteDirectoryFactory.get_directory(self.url)
-            root_listing = directory.list_dir("/")
+            root_listing = directory.list_dir("")
+            self.crawled_paths.add("")
             directory.close()
         except TimeoutError:
-            return
+            return CrawlResult(0, "timeout")
         in_q = Queue(maxsize=0)
         files_q = Queue(maxsize=0)
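
For context, the new export_to_json() drains the shared files queue with get_nowait() until queue.Empty and writes one JSON document per line. A standalone sketch of the same pattern (Record is a hypothetical stand-in for the crawler's File class; only a to_json() method is assumed, and the output filename is illustrative):

# Standalone sketch of the drain-queue-to-NDJSON pattern used by export_to_json().
# "Record" stands in for the crawler's File class; only to_json() is assumed.
import json
from queue import Queue, Empty


class Record:
    def __init__(self, path: str, name: str):
        self.path = path
        self.name = name

    def to_json(self) -> str:
        return json.dumps({"path": self.path, "name": self.name})


def export_to_json(q: Queue, out_file: str) -> int:
    counter = 0
    with open(out_file, "w") as f:
        while True:
            try:
                f.write(q.get_nowait().to_json())
                f.write("\n")
                counter += 1
            except Empty:
                break
    return counter


files_q = Queue()
files_q.put(Record("movies", "film.mkv"))
files_q.put(Record("", "readme.txt"))
print(export_to_json(files_q, "example_out.json"))  # prints 2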
@@ -86,12 +112,15 @@ class RemoteDirectoryCrawler:
         threads = []
         for i in range(self.max_threads):
-            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self.url, in_q, files_q))
+            worker = Thread(target=RemoteDirectoryCrawler._process_listings, args=(self, self.url, in_q, files_q))
             threads.append(worker)
             worker.start()
         in_q.join()
-        print("DONE")
+        print("Done")
+        exported_count = export_to_json(files_q, out_file)
+        print("exported to " + out_file)
         # Kill threads
         for _ in threads:
@@ -99,11 +128,9 @@ class RemoteDirectoryCrawler:
         for t in threads:
             t.join()
-        print(files_q.qsize())
-        return []
+        return CrawlResult(exported_count, "success")
     @staticmethod
-    def _process_listings(url: str, in_q: Queue, files_q: Queue):
+    def _process_listings(self, url: str, in_q: Queue, files_q: Queue):
         directory = RemoteDirectoryFactory.get_directory(url)
@@ -118,7 +145,10 @@ class RemoteDirectoryCrawler:
                 break
             try:
-                listing = directory.list_dir(os.path.join(file.path, file.name, ""))
+                path = os.path.join(file.path, file.name, "")
+                if path not in self.crawled_paths:
+                    listing = directory.list_dir(path)
+                    self.crawled_paths.add(path)
                 for f in listing:
                     if f.is_dir:
@@ -131,3 +161,5 @@ class RemoteDirectoryCrawler:
                 pass
             finally:
                 in_q.task_done()
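
_process_listings() is also where the new crawled_paths set comes into play: each directory path is normalised with a trailing separator and listed at most once, even if it is reachable from several parent listings. A small sketch of that check in isolation (list_dir() below is a hypothetical placeholder for RemoteDirectory.list_dir()):

# Sketch of the crawled_paths dedup; list_dir() is a hypothetical placeholder.
import os

crawled_paths = set()


def list_dir(path: str) -> list:
    return []  # placeholder listing


def visit(parent_path: str, name: str) -> bool:
    path = os.path.join(parent_path, name, "")  # e.g. "movies/2018/"
    if path in crawled_paths:
        return False                            # already listed, skip
    crawled_paths.add(path)
    list_dir(path)
    return True


print(visit("movies", "2018"))  # True: first visit
print(visit("movies", "2018"))  # False: deduplicated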

View File

@@ -38,10 +38,9 @@ class HttpDirectory(RemoteDirectory):
         self.parser = etree.HTMLParser(collect_ids=False)
     def list_dir(self, path) -> list:
         results = []
-        path_url = urljoin(self.base_url, path)
+        path_url = os.path.join(self.base_url, path.strip("/"), "")
         body = self._fetch_body(path_url)
         if not body:
             return []
@@ -130,7 +129,7 @@ class HttpDirectory(RemoteDirectory):
         try:
             r = requests.head(url, headers=HttpDirectory.HEADERS, allow_redirects=False, timeout=50)
-            stripped_url = r.url[len(self.base_url) - 1:]
+            stripped_url = url[len(self.base_url) - 1:]
             path, name = os.path.split(stripped_url)
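
The switch from urljoin() to os.path.join(self.base_url, path.strip("/"), "") changes how listing URLs are built: urljoin() discards the base path whenever the relative path starts with "/", while the join keeps the base and guarantees a trailing slash. A quick comparison with illustrative values only (the join variant produces valid URLs only where os.path.sep is "/"):

# Old vs. new URL construction, with example values.
import os
from urllib.parse import urljoin

base_url = "http://example.com/files/"
path = "/movies/2018"

print(urljoin(base_url, path))
# http://example.com/movies/2018          <- base path "files/" is dropped

print(os.path.join(base_url, path.strip("/"), ""))
# http://example.com/files/movies/2018/   (on platforms where os.path.sep == "/")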