commit 8fc858ab8fbea84655b5b60a4c10e4bf86b26265
Author: simon987
Date:   Sun Jul 5 15:20:21 2020 -0400

    initial

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5c6010f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.iml
+*.json
+.idea/
\ No newline at end of file
diff --git a/generate_tasks.py b/generate_tasks.py
new file mode 100644
index 0000000..39da98b
--- /dev/null
+++ b/generate_tasks.py
@@ -0,0 +1,94 @@
+from queue import Queue
+from threading import Thread
+from urllib.parse import urljoin
+
+import browser_cookie3
+
+import requests
+import re
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
+from hexlib.concurrency import queue_iter
+from bs4 import BeautifulSoup
+
+cj = browser_cookie3.firefox()
+
+session = requests.Session()
+session.cookies = cj
+
+SECTIONS = {
+    # Non-fiction
+    "biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
+    "educational": "https://forum.mobilism.org/viewforum.php?f=122",
+    "philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
+    "food": "https://forum.mobilism.org/viewforum.php?f=1328",
+    "health": "https://forum.mobilism.org/viewforum.php?f=545",
+    "history": "https://forum.mobilism.org/viewforum.php?f=1346",
+    "tech": "https://forum.mobilism.org/viewforum.php?f=892",
+    "general": "https://forum.mobilism.org/viewforum.php?f=126",
+
+    # Fiction
+    "romance": "https://forum.mobilism.org/viewforum.php?f=1292",
+    "erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
+    "scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
+    "mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
+    "classics": "https://forum.mobilism.org/viewforum.php?f=121",
+    "children": "https://forum.mobilism.org/viewforum.php?f=1295",
+
+    "magazines": "https://forum.mobilism.org/viewforum.php?f=123",
+    "comics": "https://forum.mobilism.org/viewforum.php?f=311",
+    "collections": "https://forum.mobilism.org/viewforum.php?f=1271",
+    "adult": "https://forum.mobilism.org/viewforum.php?f=125"
+}
+
+BASE = "https://forum.mobilism.org"
+
+
+def get_topic_id(topic_url):
+    return re.search("[&?]t=([0-9]+)", topic_url).group(1)
+
+
+def get_posts(link, start):
+    r = session.get("%s&start=%d" % (link, start))
+
+    with open("test.html", "wb") as f:
+        f.write(r.content)
+
+    soup = BeautifulSoup(r.content, "html.parser")
+    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
+        yield urljoin(BASE, elem.get("href"))
+
+
+api = TaskTrackerApi("http://localhost:8080/api")
+
+worker = Worker.from_file(api)
+if not worker:
+    worker = api.make_worker("mobilism_insert")
+    worker.dump_to_file()
+
+worker.request_access(project=1, assign=False, submit=True)
+input("Accept request")
+
+q = Queue()
+
+
+def submit_worker(q: Queue):
+    for task in queue_iter(q):
+        worker.submit_task(**task)
+
+
+for _ in range(4):
+    t = Thread(target=submit_worker, args=(q,))
+    t.setDaemon(True)
+    t.start()
+
+for page in range(0, 50000, 40):
+    for topic_url in get_posts(SECTIONS["educational"], start=page):
+        q.put(dict(
+            project=1,
+            recipe=topic_url,
+            max_assign_time=60 * 10,
+            unique_str=get_topic_id(topic_url),
+        ))
+    print(page)
+
+q.join()
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..c9ec85a
--- /dev/null
+++ b/run.py
@@ -0,0 +1,93 @@
+import gzip
+import os
+import pickle
+import re
+from base64 import b64decode
+from urllib.parse import urlparse, unquote
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def decode_cookiejar(b64_str):
+    data = b64decode(b64_str)
+    return pickle.loads(data)
+
+
+# from hexlib.web import cookiejar_filter, encode_cookiejar, decode_cookiejar, save_cookiejar
+# import browser_cookie3
+#
+# cj = cookiejar_filter(browser_cookie3.firefox(), "forum.mobilism.org|mblservices.org")
+# with open("cookies.txt", "wb") as f:
+#     f.write(encode_cookiejar(cj))
+cj = decode_cookiejar(os.environ["PROJECT_SECRET"])
+
+session = requests.Session()
+session.cookies = cj
+
+TOPIC_URL = "https://forum.mobilism.org/viewtopic.php?f=1346&t=3734829"
+
+PREMIUM_LINKS = (
+    "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
+    "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
+    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
+    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
+)
+
+
+def is_supported_premium_dl(link):
+    parsed = urlparse(link.lower())
+    return parsed.netloc in PREMIUM_LINKS
+
+
+def _download(link, i):
+    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
+
+    r = session.get(link)
+
+    with gzip.open(filename, "wb") as f:
+        f.write(r.content)
+
+
+def do_premium_download(link, i):
+    r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
+        "link": link,
+        "premium_acc": "on"
+    }, headers={
+        "Content-Type": "application/x-www-form-urlencoded"
+    })
+
+    soup = BeautifulSoup(r.content, "html.parser")
+    form = soup.find("form")
+
+    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
+        "link": form.find("input", attrs={"name": "link"}).get("value"),
+        "referer": form.find("input", attrs={"name": "referer"}).get("value"),
+        "filename": form.find("input", attrs={"name": "filename"}).get("value"),
+        "host": form.find("input", attrs={"name": "host"}).get("value"),
+        "path": form.find("input", attrs={"name": "path"}).get("value"),
+    })
+    soup2 = BeautifulSoup(r2.content, "html.parser")
+    download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
+    _download(download_link, i)
+
+
+def get_topic_id(topic_url):
+    return re.search("[&?]t=([0-9]+)", topic_url).group(1)
+
+
+def parse_topic(topic_url):
+    r = session.get(topic_url)
+    soup = BeautifulSoup(r.content, "html.parser")
+
+    for i, elem in enumerate(soup.find_all(class_="postlink")):
+        if not elem.get("href"):
+            continue
+
+        link = elem.get("href")
+        if is_supported_premium_dl(link):
+            do_premium_download(link, i)
+
+
+topic_id = get_topic_id(TOPIC_URL)
+parse_topic(TOPIC_URL)
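
Note: run.py expects the PROJECT_SECRET environment variable to contain a base64-encoded, pickled cookie jar, which decode_cookiejar() unpacks and assigns to session.cookies. The commented-out block in run.py produces that value with hexlib.web's cookiejar_filter/encode_cookiejar. Below is a minimal sketch of the same idea using only requests and browser_cookie3; it is not part of the commit, hexlib's exact behaviour is assumed, and the file name export_cookies.py is hypothetical.

# export_cookies.py (hypothetical helper, not in the repository)
# Copies Firefox cookies for the two relevant domains into a RequestsCookieJar,
# which (unlike a plain http.cookiejar.CookieJar) is picklable, then prints the
# base64 blob to paste into PROJECT_SECRET.
import copy
import pickle
import re
from base64 import b64encode

import browser_cookie3
from requests.cookies import RequestsCookieJar

jar = RequestsCookieJar()
for cookie in browser_cookie3.firefox():
    if re.search(r"forum\.mobilism\.org|mblservices\.org", cookie.domain):
        jar.set_cookie(copy.copy(cookie))

print(b64encode(pickle.dumps(jar)).decode())

The printed string round-trips through decode_cookiejar() and can be assigned to session.cookies as run.py does.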