Use task_tracker for task tracking

simon987
2019-03-24 20:21:43 -04:00
parent 00e3fd7340
commit 4ffe805b8d
29 changed files with 196 additions and 317 deletions

tasks.py

@@ -1,12 +1,18 @@
+import json
 import logging
+import os
+import time
 from threading import Thread
+from uuid import uuid4
-from apscheduler.schedulers.background import BackgroundScheduler
-from search.search import ElasticSearchEngine
-import json
-import database
 import urllib3
+
 import config
+import database
+from search.search import ElasticSearchEngine
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+from ws_bucket_client.api import WsBucketApi
 
 urllib3.disable_warnings()
 logger = logging.getLogger("default")
@@ -15,20 +21,22 @@ logger = logging.getLogger("default")
 class Task:
 
     def __init__(self, website_id: int, url: str, priority: int = 1,
-                 callback_type: str = None, callback_args: str = None):
+                 callback_type: str = None, callback_args: str = None,
+                 upload_token: str = None):
         self.website_id = website_id
         self.url = url
         self.priority = priority
         self.callback_type = callback_type
         self.callback_args = json.loads(callback_args) if callback_args else {}
+        self.upload_token = upload_token
 
     def to_json(self):
         return {
             "website_id": self.website_id,
             "url": self.url,
             "priority": self.priority,
             "callback_type": self.callback_type,
-            "callback_args": json.dumps(self.callback_args)
+            "callback_args": json.dumps(self.callback_args),
+            "upload_token": self.upload_token
         }
 
     def __str__(self):
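
For reference, a quick sketch of the crawl recipe this class now produces. The id, URL and token below are made-up illustration values, not from the commit:

    # Hypothetical usage of the Task class above
    task = Task(website_id=123, url="http://example.com/", priority=2)
    task.upload_token = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"  # normally a uuid4 assigned in queue_task
    print(task.to_json())
    # {'website_id': 123, 'url': 'http://example.com/', 'priority': 2,
    #  'callback_type': None, 'callback_args': '{}',
    #  'upload_token': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'}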
@@ -38,25 +46,13 @@ class Task:
         return self.__str__()
 
 
-class TaskResult:
+class IndexingTask:
 
-    def __init__(self, status_code=None, file_count=0, start_time=0,
-                 end_time=0, website_id=0, server_name=""):
-        self.status_code = status_code
-        self.file_count = file_count
-        self.start_time = start_time
-        self.end_time = end_time
+    def __init__(self, website_id: int, file_path: str, callback_type: str, callback_args):
         self.website_id = website_id
-        self.server_name = server_name
-
-    def to_json(self):
-        return {
-            "status_code": self.status_code,
-            "file_count": self.file_count,
-            "start_time": self.start_time,
-            "end_time": self.end_time,
-            "website_id": self.website_id
-        }
+        self.file_path = file_path
+        self.callback_type = callback_type
+        self.callback_args = callback_args
 
 
 class TaskManager:
@@ -64,14 +60,47 @@ class TaskManager:
     def __init__(self):
         self.search = ElasticSearchEngine("od-database")
         self.db = database.Database("db.sqlite3")
+        self.tracker = TaskTrackerApi(config.TT_API)
 
-    def complete_task(self, file_list, task, task_result, crawler_name):
+        self.worker = Worker.from_file(self.tracker)
+        if not self.worker:
+            self.worker = self.tracker.make_worker("oddb_master")
+            self.worker.dump_to_file()
+        self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
+        self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
 
-        self.search.delete_docs(task_result.website_id)
+        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
+
+        self._indexer_thread = Thread(target=self._do_indexing)
+        self._indexer_thread.start()
+
+    def _do_indexing(self):
+        while True:
+            logger.debug("Fetching indexing task...")
+            task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
+            if task:
+                try:
+                    recipe = task.json_recipe()
+                    logger.debug("Got indexing task: " + str(recipe))
+                    filename = os.path.join(config.WSB_PATH, format_file_name(recipe["website_id"], recipe["upload_token"]))
+                except Exception as e:
+                    print(e)
+                finally:
+                    try:
+                        self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
+                    except:
+                        pass
+            else:
+                time.sleep(5)
+
+    def _complete_task(self, file_list, task):
+
+        self.search.delete_docs(task.website_id)
 
         if file_list:
             def iter_lines():
 
                 with open(file_list, "r") as f:
                     line = f.readline()
                     while line:
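
Note that if task.json_recipe() raises, recipe and filename are still unbound when the finally block runs, so the inner call fails with a NameError that the bare except then swallows. A minimal sketch of the same loop without that hazard, assuming the same task_tracker API used above:

    def _do_indexing(self):
        while True:
            logger.debug("Fetching indexing task...")
            task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
            if not task:
                time.sleep(5)
                continue
            try:
                recipe = task.json_recipe()
                logger.debug("Got indexing task: " + str(recipe))
                filename = os.path.join(config.WSB_PATH, format_file_name(recipe["website_id"], recipe["upload_token"]))
                self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
            except Exception as e:
                # recipe and filename are only referenced inside the try block,
                # so a failed fetch can no longer raise NameError here
                logger.error("Indexing task failed: %s" % e)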
@@ -82,11 +111,38 @@ class TaskManager:
             self.db.update_website_date_if_exists(task.website_id)
 
-            task_result.server_id = crawler_name
-
-            self.db.log_result(task_result)
+    def fetch_indexing_task(self):
+        task = self.tracker.fetch_task(worker=self.worker, project_id=config.TT_INDEX_PROJECT)
+        print(task)
 
     def queue_task(self, task: Task):
-        self.db.put_task(task)
-        print("Queued task and made it available to crawlers: " + str(task.website_id))
+        max_assign_time = 24 * 7 * 3600
+        upload_token = str(uuid4())
+
+        bucket_response = self.bucket.allocate(upload_token,
+                                               21474837499,  # ~20 GiB
+                                               format_file_name(task.website_id, upload_token),
+                                               to_dispose_date=int(time.time() + max_assign_time),
+                                               upload_hook="")
+        if not bucket_response:
+            return
+
+        print("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
+
+        task.upload_token = upload_token
+        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
+                                                   recipe=str(task),
+                                                   priority=task.priority,
+                                                   max_assign_time=max_assign_time,
+                                                   hash64=task.website_id,
+                                                   verification_count=1,
+                                                   max_retries=3
+                                                   )
+        print("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
+
+
+def format_file_name(website_id, token):
+    return "%d_%s.NDJSON" % (website_id, token)