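"""Task management for od-database.

Dequeues pending websites, crawls each one with the "od_links" Scrapy spider
in a child process, imports the resulting data.json into SQLite, and builds
(and currently just prints) a reddit comment from the crawl stats.
"""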
import os
from multiprocessing import Process, Value

import praw
from apscheduler.schedulers.background import BackgroundScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from database import Database, Website
from reddit_bot import RedditBot

class TaskManager:
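    """Runs at most one crawl task at a time.

    A BackgroundScheduler thread polls the queue every second; each crawl runs
    in its own multiprocessing.Process, so every Scrapy run gets a fresh
    Twisted reactor (the reactor cannot be restarted within one process).
    """
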
    def __init__(self):
        # Shared flag: set to 1 by the crawler subprocess while a task runs
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")

        # Poll the task queue once per second on a background thread
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()

    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()

            if task:
                website_id, post_id = task
                website = self.db.get_website_by_id(website_id)
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id))
                self.current_website = website
                self.current_task.start()

        elif self.busy.value == 0:
            # The subprocess cleared the busy flag, so the crawl is done:
            # reap the process and free the slot for the next task
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None

    def execute_task(self, website: Website, busy: Value, post_id: str):
        busy.value = 1
        # The crawl writes its results to data.json; remove any stale file first
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()  # Blocks until the crawl finishes
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # TODO check should_comment()
            # The comment is only printed for now, not submitted
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment(stats, website.id)
            print(comment)
            print(self.reddit_bot.reddit.submission(post_id))

        busy.value = 0
        print("Done crawling task")