Mirror of https://github.com/simon987/mobilism_scrape.git (synced 2025-12-14 07:09:03 +00:00)
initial
This commit is contained in:
94
generate_tasks.py
Normal file
94
generate_tasks.py
Normal file
@@ -0,0 +1,94 @@
|
||||
from queue import Queue
|
||||
from threading import Thread
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import browser_cookie3
|
||||
|
||||
import requests
|
||||
import re
|
||||
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
|
||||
from hexlib.concurrency import queue_iter
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Cookie jar loaded from the local Firefox profile via browser_cookie3.
cj = browser_cookie3.firefox()

# Single shared HTTP session; every request it makes sends those cookies.
session = requests.Session()
session.cookies = cj
|
||||
|
||||
# Forum sections keyed by short name; each value is the section's listing URL.
SECTIONS = {
    # Non-fiction
    "biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
    "educational": "https://forum.mobilism.org/viewforum.php?f=122",
    "philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
    "food": "https://forum.mobilism.org/viewforum.php?f=1328",
    "health": "https://forum.mobilism.org/viewforum.php?f=545",
    "history": "https://forum.mobilism.org/viewforum.php?f=1346",
    "tech": "https://forum.mobilism.org/viewforum.php?f=892",
    "general": "https://forum.mobilism.org/viewforum.php?f=126",

    # Fiction
    "romance": "https://forum.mobilism.org/viewforum.php?f=1292",
    "erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
    "scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
    "mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
    "classics": "https://forum.mobilism.org/viewforum.php?f=121",
    "children": "https://forum.mobilism.org/viewforum.php?f=1295",

    "magazines": "https://forum.mobilism.org/viewforum.php?f=123",
    "comics": "https://forum.mobilism.org/viewforum.php?f=311",
    "collections": "https://forum.mobilism.org/viewforum.php?f=1271",
    "adult": "https://forum.mobilism.org/viewforum.php?f=125",
}

# Base URL against which relative topic links are resolved.
BASE = "https://forum.mobilism.org"
|
||||
|
||||
|
||||
def get_topic_id(topic_url):
    """Return the numeric topic id embedded in a topic URL.

    The id is read from the ``t=<digits>`` query parameter, so
    ``viewtopic.php?f=122&t=4567`` yields ``"4567"``.

    :param topic_url: URL string of a forum topic.
    :return: the topic id as a string of digits.
    :raises ValueError: if the URL carries no ``t=<digits>`` parameter.
    """
    match = re.search(r"[&?]t=([0-9]+)", topic_url)
    if match is None:
        # Fail with a readable message rather than the AttributeError that
        # calling .group() on None would produce.
        raise ValueError("no topic id found in URL: %r" % (topic_url,))
    return match.group(1)
|
||||
|
||||
|
||||
def get_posts(link, start):
    """Yield the absolute URL of every topic linked from one listing page.

    :param link: listing URL; ``&start=<start>`` is appended to it.
    :param start: integer sent as the ``start`` query parameter.
    :return: generator of topic URLs, each resolved against ``BASE``.
    """
    r = session.get("%s&start=%d" % (link, start))

    # Snapshot of the most recently fetched page; overwritten on every call.
    with open("test.html", "wb") as f:
        f.write(r.content)

    soup = BeautifulSoup(r.content, "html.parser")
    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
        href = elem.get("href")
        # An anchor without href would make urljoin() return BASE itself,
        # which is not a topic URL; skip it.
        if href:
            yield urljoin(BASE, href)
|
||||
|
||||
|
||||
# Client for the task tracker API running on localhost.
api = TaskTrackerApi("http://localhost:8080/api")

# Load a worker through Worker.from_file; when that yields nothing, create
# one named "mobilism_insert" and dump it to file.
worker = Worker.from_file(api)
if not worker:
    worker = api.make_worker("mobilism_insert")
    worker.dump_to_file()

# NOTE(review): assumed to run on every start, not only for a new worker --
# confirm against the upstream file.
worker.request_access(project=1, assign=False, submit=True)
# Block until the operator presses Enter.
input("Accept request")

# Queue of task keyword-argument dicts consumed by the submit threads.
q = Queue()
|
||||
|
||||
|
||||
def submit_worker(q: Queue):
    """Submit every task taken from *q* through ``worker.submit_task``.

    Each queue item is a dict that is unpacked into keyword arguments.
    Used as a thread target.

    :param q: queue of task keyword-argument dicts.
    """
    for task in queue_iter(q):
        worker.submit_task(**task)
|
||||
|
||||
|
||||
# Start four daemon threads that drain the queue. The daemon flag is passed
# to the constructor because Thread.setDaemon() is deprecated.
for _ in range(4):
    t = Thread(target=submit_worker, args=(q,), daemon=True)
    t.start()

# Walk the "educational" section with start values 0, 40, 80, ... and queue
# one task per topic found.
for page in range(0, 50000, 40):
    for topic_url in get_posts(SECTIONS["educational"], start=page):
        q.put(dict(
            project=1,
            recipe=topic_url,
            max_assign_time=60 * 10,
            unique_str=get_topic_id(topic_url),
        ))
    # Progress indicator: the start value just processed.
    print(page)

# Wait for the queued tasks to be processed before exiting
# (presumably queue_iter marks each item done -- TODO confirm).
q.join()
|
||||
Reference in New Issue
Block a user