mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 02:16:47 +00:00)
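
"""Task queue for the crawl server.

TaskManager polls a TaskManagerDatabase queue once per second and hands
each popped Task to a multiprocessing Pool, which crawls the task's URL
with RemoteDirectoryCrawler.
"""
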
from datetime import datetime
from enum import Enum
from multiprocessing import Pool

from apscheduler.schedulers.background import BackgroundScheduler

from crawl_server.database import TaskManagerDatabase, Task
from crawler.crawler import RemoteDirectoryCrawler


class TaskResultStatus(Enum):
    SUCCESS = 0
    FAILURE = 1


class TaskResult:

    def __init__(self):
        self.status_code: TaskResultStatus = None
        self.file_count = 0
        self.start_time = None
        self.end_time = None
        self.website_id = None


class TaskManager:

    def __init__(self, db_path, max_processes=8):
        self.db = TaskManagerDatabase(db_path)
        self.pool = Pool(processes=max_processes)

        # Poll the task queue every second and dispatch queued tasks to the pool
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.execute_queued_task, "interval", seconds=1)
        scheduler.start()

    def put_task(self, task: Task):
        self.db.put_task(task)

    def get_tasks(self):
        return self.db.get_tasks()

    def execute_queued_task(self):

        task = self.db.pop_task()
        if task:
            print("pooled " + task.url)
            # Run the crawl in a worker process; task_complete is invoked
            # with the TaskResult once run_task returns
            self.pool.apply_async(
                TaskManager.run_task,
                args=(task,),
                callback=TaskManager.task_complete
            )

    @staticmethod
    def run_task(task):
        result = TaskResult()
        result.start_time = datetime.utcnow()

        print("Starting task " + task.url)

        crawler = RemoteDirectoryCrawler(task.url, 10)
        crawler.crawl_directory()

        print("End task " + task.url)

        result.end_time = datetime.utcnow()

        # status_code, file_count and website_id are not populated yet
        # (see the todo in task_complete)
        return result

    @staticmethod
    def task_complete(result: TaskResult):
        print("Task done " + str(result))
        # todo save in db
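
# Minimal usage sketch (illustrative, not part of the original module;
# assumes a SQLite path for TaskManagerDatabase and a Task exposing a
# `url` attribute, as the code above implies):
#
#   tm = TaskManager("tm_db.sqlite3", max_processes=4)
#   tm.put_task(some_task)  # picked up by the 1-second scheduler poll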