Task crawl result now logged in a database

This commit is contained in:
Simon
2018-06-12 11:03:45 -04:00
parent 011b8455a7
commit 6d48f1f780
6 changed files with 70 additions and 29 deletions

View File

@@ -1,23 +1,14 @@
from crawl_server.database import TaskManagerDatabase, Task
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
from multiprocessing import Pool
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime
from crawler.crawler import RemoteDirectoryCrawler
class TaskResult:
    """Plain record describing the outcome of one crawl task.

    A new instance starts with placeholder values; ``TaskManager.run_task``
    assigns ``start_time``, ``website_id``, ``file_count``, ``status_code``
    and ``end_time`` while it runs the task.
    """

    def __init__(self):
        # Annotated as str, but stays None until a status is assigned.
        self.status_code: str = None
        # Number of files reported by the crawl; 0 until assigned.
        self.file_count = 0
        # Start / end timestamps; None until assigned.
        self.start_time = None
        self.end_time = None
        # Identifier of the website the task belongs to; None until assigned.
        self.website_id = None
class TaskManager:
def __init__(self, db_path, max_processes=8):
self.db_path = db_path
self.db = TaskManagerDatabase(db_path)
self.pool = Pool(processes=max_processes)
@@ -38,19 +29,21 @@ class TaskManager:
print("pooled " + task.url)
self.pool.apply_async(
TaskManager.run_task,
args=(task, ),
callback=TaskManager.task_complete
args=(task, self.db_path),
callback=TaskManager.task_complete,
error_callback=TaskManager.task_error
)
@staticmethod
def run_task(task):
def run_task(task, db_path):
result = TaskResult()
result.start_time = datetime.utcnow()
result.website_id = task.website_id
print("Starting task " + task.url)
crawler = RemoteDirectoryCrawler(task.url, 100)
crawl_result = crawler.crawl_directory("12345.json")
crawl_result = crawler.crawl_directory("crawled/" + str(task.website_id) + ".json")
result.file_count = crawl_result.file_count
result.status_code = crawl_result.status_code
@@ -59,15 +52,24 @@ class TaskManager:
result.end_time = datetime.utcnow()
return result
return dict(result=result, db_path=db_path)
# Success callback passed to Pool.apply_async as `callback=`.
# It receives the dict returned by run_task (keys "result" and "db_path"),
# prints the result's status code, file count and start/end times, then opens
# a TaskManagerDatabase on db_path and records the result with log_result().
# NOTE(review): this span is a diff rendering without +/- markers; the two
# `def task_complete` lines and the "todo save in db" comment appear to be the
# pre- and post-change versions of the same method shown together.
@staticmethod
def task_complete(result: TaskResult):
def task_complete(kwargs):
result = kwargs["result"]
db_path = kwargs["db_path"]
print(result.status_code)
print(result.file_count)
print(result.start_time)
print(result.end_time)
# todo save in db
db = TaskManagerDatabase(db_path)
db.log_result(result)
@staticmethod
def task_error(err):
    """Report a failed task: print an "ERROR" marker line, then the error.

    Passed to ``Pool.apply_async`` as ``error_callback``; ``err`` is whatever
    the pool hands to that callback.
    """
    for line in ("ERROR", err):
        print(line)