Mirror of https://github.com/simon987/od-database.git, synced 2025-12-13 23:09:01 +00:00
Initial commit
task.py (new file, 67 lines added)
@@ -0,0 +1,67 @@
from apscheduler.schedulers.background import BackgroundScheduler
import os
from database import Website
from multiprocessing import Value, Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from database import Database
from reddit_bot import RedditBot
import praw


class TaskManager:
    """Polls the database for queued websites and runs one crawl task at a time."""

    def __init__(self):
        # Shared flag so the parent process can tell whether a crawl is still running.
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")
        # Poll the task queue every second from a background thread.
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()

    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()

            if task:
                website_id, post_id = task
                website = self.db.get_website_by_id(website_id)
                # Run the crawl in a separate process so it can be terminated cleanly.
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id))
                self.current_website = website
                self.current_task.start()

        elif self.busy.value == 0:
            # The crawl process has finished; clean up so the next task can start.
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None

    def execute_task(self, website: Website, busy: Value, post_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        # Load the scraped links into SQLite, then discard the intermediate file.
        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        # If the task came from a reddit post, build the stats comment for it.
        if post_id:
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment(stats, website.id)
            print(comment)
            print(self.reddit_bot.reddit.submission(post_id))

        busy.value = 0
        print("Done crawling task")
||||