od-database/task.py

import os
from multiprocessing import Value, Process

import praw
from apscheduler.schedulers.background import BackgroundScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from database import Database, Website
from reddit_bot import RedditBot
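

# Polls the task queue once per second and runs at most one crawl at a time,
# each crawl in its own child process.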
class TaskManager:

    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")

        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()
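
    # Called by the scheduler every second: start the next queued task if the
    # worker is idle, otherwise clean up once the running crawl has finished.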
    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()
            if task:
                website_id, post_id = task
                website = self.db.get_website_by_id(website_id)
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id))
                self.current_website = website
                self.current_task.start()

        elif self.busy.value == 0:
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None
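
    # Runs in a child process: crawl the website with scrapy, import the
    # resulting data.json into SQLite, then build a reddit comment for the
    # originating post (currently only printed, not submitted).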
    def execute_task(self, website: Website, busy: Value, post_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")

        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # TODO check should_comment()
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment(stats, website.id)
            print(comment)
            print(self.reddit_bot.reddit.submission(post_id))

        busy.value = 0
        print("Done crawling task")