1
0
mirror of https://github.com/simon987/mobilism_scrape.git synced 2025-04-08 05:16:44 +00:00
2020-07-06 19:35:22 -04:00

98 lines
2.8 KiB
Python

import re
from queue import Queue
from threading import Thread
from urllib.parse import urljoin
import browser_cookie3
import requests
from bs4 import BeautifulSoup
from hexlib.concurrency import queue_iter
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
# Reuse the local Firefox profile's cookies so all requests are made with the
# user's existing forum.mobilism.org login session (no credentials in code).
cj = browser_cookie3.firefox()
session = requests.Session()
session.cookies = cj
# Forum section name -> "viewforum" listing URL on forum.mobilism.org.
# Only one section is actually crawled below (see SECTIONS["educational"]).
SECTIONS = {
# Non-fiction
"biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
"educational": "https://forum.mobilism.org/viewforum.php?f=122",
"philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
"food": "https://forum.mobilism.org/viewforum.php?f=1328",
"health": "https://forum.mobilism.org/viewforum.php?f=545",
"history": "https://forum.mobilism.org/viewforum.php?f=1346",
"tech": "https://forum.mobilism.org/viewforum.php?f=892",
"general": "https://forum.mobilism.org/viewforum.php?f=126",
# Fiction
"romance": "https://forum.mobilism.org/viewforum.php?f=1292",
"erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
"scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
"mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
"classics": "https://forum.mobilism.org/viewforum.php?f=121",
"children": "https://forum.mobilism.org/viewforum.php?f=1295",
"magazines": "https://forum.mobilism.org/viewforum.php?f=123",
"comics": "https://forum.mobilism.org/viewforum.php?f=311",
"collections": "https://forum.mobilism.org/viewforum.php?f=1271",
"adult": "https://forum.mobilism.org/viewforum.php?f=125"
}
# Base used to resolve the relative hrefs found in listing pages.
BASE = "https://forum.mobilism.org"
def get_topic_id(topic_url):
    """Return the numeric topic id taken from the ``t=`` query parameter.

    Raises AttributeError if the URL has no ``t=<digits>`` parameter.
    """
    match = re.search("[&?]t=([0-9]+)", topic_url)
    return match.group(1)
def get_posts(link, start):
    """Yield absolute topic URLs from one page of a section listing.

    :param link: "viewforum" URL of a section (see SECTIONS).
    :param start: pagination offset passed as the ``start=`` query parameter.
    """
    # Fix: removed leftover debug code that dumped every fetched page to
    # ./test.html, silently overwriting the file on each call.
    r = session.get("%s&start=%d" % (link, start))
    soup = BeautifulSoup(r.content, "html.parser")
    # Each topic row links its title with class "topictitle"; hrefs are
    # relative, so resolve them against the forum base URL.
    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
        yield urljoin(BASE, elem.get("href"))
# Connect to the task_tracker instance and load saved worker credentials,
# creating and registering a new worker on first run.
api = TaskTrackerApi("https://tt.simon987.net/api")
worker = Worker.from_file(api)
if not worker:
    worker = api.make_worker("mobilism_insert")
    worker.dump_to_file()
    # Ask for submit-only access to project 1; an admin must approve the
    # request server-side, hence the interactive pause below.
    worker.request_access(project=1, assign=False, submit=True)
    input("Accept request")
# Work queue shared between the producer loop below and the submit threads.
q = Queue()
def submit_worker(q: Queue) -> None:
    """Drain tasks from *q* and submit each to task_tracker.

    NOTE(review): the final q.join() relies on queue_iter calling
    q.task_done() for every consumed item — confirm against hexlib.
    """
    for task in queue_iter(q):
        worker.submit_task(**task)
# Spawn 32 daemon threads to submit queued tasks concurrently; daemon=True
# lets the process exit even if threads are still blocked on the queue.
# Fix: Thread.setDaemon() is deprecated (since Python 3.10) in favor of the
# daemon constructor argument / attribute.
for _ in range(32):
    t = Thread(target=submit_worker, args=(q,), daemon=True)
    t.start()
count = 0
# Crawl the "educational" section one listing page (40 topics) at a time and
# queue one task_tracker task per topic.  Earlier pages (newer topics) get
# higher priority.
for page in range(0, 50000, 40):
    for topic_url in get_posts(SECTIONS["educational"], start=page):
        q.put(dict(
            project=1,
            recipe=topic_url,  # task payload: the topic URL to process
            max_assign_time=60 * 10,  # 10 min before the task is re-assigned
            unique_str=get_topic_id(topic_url),  # topic id used for dedup
            priority=max(32000 - page, 0)  # decays with depth, floors at 0
        ))
        count += 1
    print(count)
# Block until all queued tasks have been handled by the submit threads
# (assumes queue_iter marks items with task_done — TODO confirm).
q.join()