Mirror of https://github.com/simon987/od-database.git, synced 2025-12-13 23:09:01 +00:00
Crawl tasks are now fetched by the crawlers instead of pushed by the server
tasks.py (new file, 90 lines added)
@@ -0,0 +1,90 @@
from apscheduler.schedulers.background import BackgroundScheduler
from werkzeug.datastructures import FileStorage
from search.search import ElasticSearchEngine
import json
import database
import urllib3

# Note: BackgroundScheduler and FileStorage are imported but not used in this file
urllib3.disable_warnings()


class Task:

    def __init__(self, website_id: int, url: str, priority: int = 1,
                 callback_type: str = None, callback_args: str = None):
        self.website_id = website_id
        self.url = url
        self.priority = priority
        self.callback_type = callback_type
        # callback_args arrives as a JSON string; store it decoded
        self.callback_args = json.loads(callback_args) if callback_args else {}

    def to_json(self):
        return {
            "website_id": self.website_id,
            "url": self.url,
            "priority": self.priority,
            "callback_type": self.callback_type,
            # re-encoded, so the value travels as a plain JSON string
            "callback_args": json.dumps(self.callback_args)
        }

    def __str__(self):
        return json.dumps(self.to_json())

    def __repr__(self):
        return self.__str__()
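Note the double encoding in to_json: callback_args is re-serialized with json.dumps, so it travels as a JSON string nested inside the outer JSON object, and Task.__init__ decodes it again on the receiving side. A minimal round-trip sketch; the callback type and arguments below are invented for illustration:

task = Task(website_id=1, url="http://example.com/files/", priority=2,
            callback_type="example_callback",  # hypothetical callback type
            callback_args='{"post_id": "abc123"}')  # hypothetical args

print(task)
# {"website_id": 1, "url": "http://example.com/files/", "priority": 2,
#  "callback_type": "example_callback", "callback_args": "{\"post_id\": \"abc123\"}"}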
class TaskResult:

    def __init__(self, status_code=None, file_count=0, start_time=0,
                 end_time=0, website_id=0, server_name=""):
        self.status_code = status_code
        self.file_count = file_count
        self.start_time = start_time
        self.end_time = end_time
        self.website_id = website_id
        self.server_name = server_name

    def to_json(self):
        # server_name is omitted from the payload; TaskManager.complete_task
        # stamps it server-side from the reporting crawler's name
        return {
            "status_code": self.status_code,
            "file_count": self.file_count,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "website_id": self.website_id
        }
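A crawler would populate a TaskResult when a crawl finishes and report to_json() back to the server. A short sketch; every field value here is invented, and whether status_code is a string or a number is not pinned down by this file:

import time

result = TaskResult(status_code="success", file_count=12000,
                    start_time=int(time.time()) - 300,
                    end_time=int(time.time()),
                    website_id=1)
payload = result.to_json()  # no "server_name" key; the server adds it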
class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")

    def complete_task(self, file_list, task, task_result, crawler_name):

        if file_list:
            # Replace the website's documents: drop the old index entries,
            # then stream the new file list into Elasticsearch one line
            # (one JSON document) at a time, without loading the whole
            # file into memory
            self.search.delete_docs(task_result.website_id)

            def iter_lines():
                with open(file_list, "r") as f:
                    line = f.readline()
                    while line:
                        yield line
                        line = f.readline()

            self.search.import_json(iter_lines(), task.website_id)

        self.db.update_website_date_if_exists(task.website_id)

        # Record which crawler produced this result before logging it
        task_result.server_name = crawler_name

        self.db.log_result(task_result)

    def queue_task(self, task: Task):
        self.db.put_task(task)
        print("Queued task and made it available to crawlers: " + str(task.website_id))

    def get_queued_tasks(self) -> list:
        return self.db.get_tasks()
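Taken together, this is the server half of the pull model named in the commit message: queue_task stores work in the database, crawlers fetch it through whatever endpoint the server wraps around get_queued_tasks (not part of this file), and complete_task ingests the reported file list. A minimal in-process sketch, assuming db.sqlite3 and Elasticsearch are reachable and that db.get_tasks returns Task objects; the URL, file path, and counts are invented:

manager = TaskManager()

# Server side: make a crawl task available for crawlers to fetch
manager.queue_task(Task(website_id=1, url="http://example.com/files/"))

# A crawler-facing endpoint would hand these out from here
pending = manager.get_queued_tasks()

# When the crawler reports back: an NDJSON file list plus a TaskResult
task = pending[0]
result = TaskResult(status_code="success", file_count=2, website_id=task.website_id)
manager.complete_task("crawled_files.ndjson", task, result, crawler_name="crawler-1")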