od-database/crawl_server/task_manager.py

from datetime import datetime
from enum import Enum
from multiprocessing import Pool

from apscheduler.schedulers.background import BackgroundScheduler

from crawl_server.database import TaskManagerDatabase, Task
from crawler.crawler import RemoteDirectoryCrawler
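
# This module manages the crawl task queue: queued tasks are stored in a
# local database, polled once a second by a background scheduler, and handed
# to a process pool that runs the actual directory crawls.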


class TaskResultStatus(Enum):
    SUCCESS = 0
    FAILURE = 1


class TaskResult:

    def __init__(self):
        self.status_code: TaskResultStatus = None
        self.file_count = 0
        self.start_time = None
        self.end_time = None
        self.website_id = None
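
# A TaskResult is built in the worker process and pickled back to the parent
# through the pool's completion callback (task_complete below).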


class TaskManager:

    def __init__(self, db_path, max_processes=8):
        self.db = TaskManagerDatabase(db_path)
        self.pool = Pool(processes=max_processes)

        # Check for queued tasks every second and dispatch them to the pool
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.execute_queued_task, "interval", seconds=1)
        scheduler.start()

    def put_task(self, task: Task):
        self.db.put_task(task)

    def get_tasks(self):
        return self.db.get_tasks()

    def execute_queued_task(self):
        task = self.db.pop_task()

        if task:
            print("pooled " + task.url)
            self.pool.apply_async(
                TaskManager.run_task,
                args=(task,),
                callback=TaskManager.task_complete
            )
    @staticmethod
    def run_task(task):
        result = TaskResult()
        result.start_time = datetime.utcnow()
        result.website_id = task.website_id  # assumes Task rows carry the website id

        print("Starting task " + task.url)
        crawler = RemoteDirectoryCrawler(task.url, 10)
        try:
            crawler.crawl_directory()
            result.status_code = TaskResultStatus.SUCCESS
        except Exception:
            # Record the failure rather than raising: an exception in a worker
            # is swallowed by the pool and task_complete would never fire
            result.status_code = TaskResultStatus.FAILURE
        print("End task " + task.url)

        result.end_time = datetime.utcnow()
        return result

    @staticmethod
    def task_complete(result: TaskResult):
        print("Task done " + str(result))
        # todo save in db
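

# A minimal usage sketch, not part of the original module. The Task
# constructor arguments shown here (a website id and a URL) are hypothetical;
# check crawl_server/database.py for the actual signature.
if __name__ == "__main__":
    import time

    manager = TaskManager("task_manager.sqlite3", max_processes=4)
    manager.put_task(Task(1, "http://example.com/files/"))  # hypothetical Task signature

    # The background scheduler pops the task within a second and hands it to
    # the worker pool; task_complete() runs in this process when the crawl ends.
    time.sleep(10)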