mirror of
https://github.com/simon987/od-database.git
synced 2025-12-13 23:09:01 +00:00
Task crawl result now logged in a database
This commit is contained in:
@@ -1,23 +1,14 @@
|
||||
from crawl_server.database import TaskManagerDatabase, Task
|
||||
from crawl_server.database import TaskManagerDatabase, Task, TaskResult
|
||||
from multiprocessing import Pool
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from datetime import datetime
|
||||
from crawler.crawler import RemoteDirectoryCrawler
|
||||
|
||||
|
||||
class TaskResult:
|
||||
|
||||
def __init__(self):
|
||||
self.status_code: str = None
|
||||
self.file_count = 0
|
||||
self.start_time = None
|
||||
self.end_time = None
|
||||
self.website_id = None
|
||||
|
||||
|
||||
class TaskManager:
|
||||
|
||||
def __init__(self, db_path, max_processes=8):
|
||||
self.db_path = db_path
|
||||
self.db = TaskManagerDatabase(db_path)
|
||||
self.pool = Pool(processes=max_processes)
|
||||
|
||||
@@ -38,19 +29,21 @@ class TaskManager:
|
||||
print("pooled " + task.url)
|
||||
self.pool.apply_async(
|
||||
TaskManager.run_task,
|
||||
args=(task, ),
|
||||
callback=TaskManager.task_complete
|
||||
args=(task, self.db_path),
|
||||
callback=TaskManager.task_complete,
|
||||
error_callback=TaskManager.task_error
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def run_task(task):
|
||||
def run_task(task, db_path):
|
||||
result = TaskResult()
|
||||
result.start_time = datetime.utcnow()
|
||||
result.website_id = task.website_id
|
||||
|
||||
print("Starting task " + task.url)
|
||||
|
||||
crawler = RemoteDirectoryCrawler(task.url, 100)
|
||||
crawl_result = crawler.crawl_directory("12345.json")
|
||||
crawl_result = crawler.crawl_directory("crawled/" + str(task.website_id) + ".json")
|
||||
|
||||
result.file_count = crawl_result.file_count
|
||||
result.status_code = crawl_result.status_code
|
||||
@@ -59,15 +52,24 @@ class TaskManager:
|
||||
|
||||
result.end_time = datetime.utcnow()
|
||||
|
||||
return result
|
||||
return dict(result=result, db_path=db_path)
|
||||
|
||||
@staticmethod
|
||||
def task_complete(result: TaskResult):
|
||||
def task_complete(kwargs):
|
||||
result = kwargs["result"]
|
||||
db_path = kwargs["db_path"]
|
||||
print(result.status_code)
|
||||
print(result.file_count)
|
||||
print(result.start_time)
|
||||
print(result.end_time)
|
||||
# todo save in db
|
||||
|
||||
db = TaskManagerDatabase(db_path)
|
||||
db.log_result(result)
|
||||
|
||||
@staticmethod
|
||||
def task_error(err):
|
||||
print("ERROR")
|
||||
print(err)
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user