from queue import Queue
from threading import Thread
from urllib.parse import urljoin
import re

import browser_cookie3
import requests
from bs4 import BeautifulSoup

from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
from hexlib.concurrency import queue_iter

# Reuse Firefox's cookie jar so requests to the forum are authenticated.
cj = browser_cookie3.firefox()
session = requests.Session()
session.cookies = cj

SECTIONS = {
    # Non-fiction
    "biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
    "educational": "https://forum.mobilism.org/viewforum.php?f=122",
    "philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
    "food": "https://forum.mobilism.org/viewforum.php?f=1328",
    "health": "https://forum.mobilism.org/viewforum.php?f=545",
    "history": "https://forum.mobilism.org/viewforum.php?f=1346",
    "tech": "https://forum.mobilism.org/viewforum.php?f=892",
    "general": "https://forum.mobilism.org/viewforum.php?f=126",
    # Fiction
    "romance": "https://forum.mobilism.org/viewforum.php?f=1292",
    "erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
    "scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
    "mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
    "classics": "https://forum.mobilism.org/viewforum.php?f=121",
    "children": "https://forum.mobilism.org/viewforum.php?f=1295",
    "magazines": "https://forum.mobilism.org/viewforum.php?f=123",
    "comics": "https://forum.mobilism.org/viewforum.php?f=311",
    "collections": "https://forum.mobilism.org/viewforum.php?f=1271",
    "adult": "https://forum.mobilism.org/viewforum.php?f=125",
}

BASE = "https://forum.mobilism.org"


def get_topic_id(topic_url):
    """Extract the numeric topic id (the t= query parameter) from a topic URL."""
    return re.search("[&?]t=([0-9]+)", topic_url).group(1)


def get_posts(link, start):
    """Yield absolute topic URLs from one section index page; start is the pagination offset."""
    r = session.get("%s&start=%d" % (link, start))

    # Debug: keep a copy of the last fetched page for inspection.
    with open("test.html", "wb") as f:
        f.write(r.content)

    soup = BeautifulSoup(r.content, "html.parser")
    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
        yield urljoin(BASE, elem.get("href"))


api = TaskTrackerApi("http://localhost:8080/api")

# Load worker credentials from disk, or register a new worker and
# wait for its access request to be approved on the project.
worker = Worker.from_file(api)
if not worker:
    worker = api.make_worker("mobilism_insert")
    worker.dump_to_file()
    worker.request_access(project=1, assign=False, submit=True)
    input("Accept request")

q = Queue()


def submit_worker(q: Queue):
    # Drain the queue and submit each task to the tracker.
    for task in queue_iter(q):
        worker.submit_task(**task)


# Four submitter threads so task submission keeps up with scraping.
for _ in range(4):
    t = Thread(target=submit_worker, args=(q,))
    t.daemon = True  # setDaemon() is deprecated; set the attribute directly
    t.start()

# Walk the section index 40 topics at a time (one index page per request).
for page in range(0, 50000, 40):
    for topic_url in get_posts(SECTIONS["educational"], start=page):
        q.put(dict(
            project=1,
            recipe=topic_url,
            max_assign_time=60 * 10,
            unique_str=get_topic_id(topic_url),  # deduplicate tasks by topic id
        ))
    print(page)  # progress indicator

q.join()