diff --git a/generate_tasks.py b/generate_tasks.py
index 39da98b..6513c3c 100644
--- a/generate_tasks.py
+++ b/generate_tasks.py
@@ -1,14 +1,14 @@
+import re
 from queue import Queue
 from threading import Thread
 from urllib.parse import urljoin
 
 import browser_cookie3
-
 import requests
-import re
-from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
-from hexlib.concurrency import queue_iter
 from bs4 import BeautifulSoup
+from hexlib.concurrency import queue_iter
+
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 
 cj = browser_cookie3.firefox()
 
@@ -58,7 +58,7 @@ def get_posts(link, start):
         yield urljoin(BASE, elem.get("href"))
 
 
-api = TaskTrackerApi("http://localhost:8080/api")
+api = TaskTrackerApi("https://tt.simon987.net/api")
 worker = Worker.from_file(api)
 
 if not worker:
@@ -76,11 +76,12 @@ def submit_worker(q: Queue):
         worker.submit_task(**task)
 
 
-for _ in range(4):
+for _ in range(32):
     t = Thread(target=submit_worker, args=(q,))
     t.setDaemon(True)
     t.start()
 
+count = 0
 for page in range(0, 50000, 40):
     for topic_url in get_posts(SECTIONS["educational"], start=page):
         q.put(dict(
@@ -88,7 +89,9 @@ for page in range(0, 50000, 40):
             recipe=topic_url,
             max_assign_time=60 * 10,
             unique_str=get_topic_id(topic_url),
+            priority=max(32000 - page, 0)
         ))
-    print(page)
+    count += 1
+    print(count)
 
 q.join()
diff --git a/run b/run
index a2ee897..2f0099a 100755
--- a/run
+++ b/run
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 
-import gzip
 import json
 import os
-import pickle
 import re
 import subprocess
-from base64 import b64decode
+import tarfile
+from io import BytesIO
 from sys import stderr
+from tarfile import TarInfo, TarFile
+from time import time
 from urllib.parse import urlparse, unquote
 
 import requests
@@ -22,12 +23,22 @@ session = requests.Session()
 
 TOPIC_URL = os.environ["TASK_RECIPE"]
 
-PREMIUM_LINKS = (
+METADATA = {
+    "ts": time(),
+    "topic_url": TOPIC_URL
+}
+
+# See https://forum.mobilism.org/filehosts.xml
+PREMIUM_LINKS = {
     "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com", "mediafire.com",
     "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
-    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
-    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
-)
+    "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
+    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in",
+    "rapidshare.com", "uplod.it", "2shared.com", "billionuploads.com", "tusfiles.com",
+    "dropapk.com", "dropapk.to", "douploads.com", "douploads.me", "dailyuploads.net", "dailyuploads.cc",
+    "upload.ac", "ddl.to", "ddownload.com", "uploadev.com", "uploadev.org", "uploadrar.com", "uploadrar.net",
+    "filetitle.com", "filerio.in", "mega4up.com", "filezip.cc", "dropgalaxy.in"
+}
 
 
 def is_supported_premium_dl(link):
@@ -35,16 +46,26 @@ def is_supported_premium_dl(link):
     return parsed.netloc in PREMIUM_LINKS
 
 
+def add_buf_to_tar(tar: TarFile, filename, data: bytes):
+    buf = BytesIO()
+    buf.write(data)
+    buf.flush()
+    buf.seek(0)
+    info = TarInfo(name=filename)
+    info.size = len(data)
+    tar.addfile(info, buf)
+
+
 def _download(link, i):
-    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
+    filename = "%s%02d_%s" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
 
     r = session.get(link)
 
-    with gzip.open(filename, "wb") as f:
-        f.write(r.content)
+    with tarfile.open(filename + ".tar.gz", "w:gz") as tar:
+        add_buf_to_tar(tar, filename, r.content)
+        add_buf_to_tar(tar, "meta.json", json.dumps(METADATA).encode())
 
-    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
-    quit(0)
+    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename + ".tar.gz", "staging:mobilism/"])
 
 
 def do_premium_download(link, i):
@@ -55,32 +76,44 @@ def do_premium_download(link, i):
         "Content-Type": "application/x-www-form-urlencoded"
     })
 
+    METADATA["do_premium_download"] = {
+        "link": link,
+        "response": r.text
+    }
+
     soup = BeautifulSoup(r.content, "html.parser")
     form = soup.find("form")
 
-    with open("debug.do_premium_download.html", "wb") as f:
-        f.write(r.content)
-
     if not form:
         if "The file you were looking for could not be found" not in r.text:
             print(r.content, file=stderr)
         return
 
-    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
+    data = {
         "link": form.find("input", attrs={"name": "link"}).get("value"),
         "referer": form.find("input", attrs={"name": "referer"}).get("value"),
         "filename": form.find("input", attrs={"name": "filename"}).get("value"),
         "host": form.find("input", attrs={"name": "host"}).get("value"),
         "path": form.find("input", attrs={"name": "path"}).get("value"),
-    })
+    }
+    port_el = form.find("input", attrs={"name": "port"})
+    if port_el:
+        data["port"] = port_el.get("value")
+    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data)
+
+    METADATA["do_premium_download2"] = {
+        "data": data,
+        "response": r2.text
+    }
+
     soup2 = BeautifulSoup(r2.content, "html.parser")
     try:
         download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
         _download(download_link, i)
-    except:
-        if "not found" not in r.text:
+    except Exception as e:
+        if "not found" not in r2.text:
             print(r2.content, file=stderr)
-        pass
+        raise e
 
 
 def get_topic_id(topic_url):
@@ -91,8 +124,10 @@ def parse_topic(topic_url):
     r = session.get(topic_url)
     soup = BeautifulSoup(r.content, "html.parser")
 
-    with open("debug.parse_topic.html", "wb") as f:
-        f.write(r.content)
+    METADATA["parse_topic"] = {
+        "topic_url": topic_id,
+        "response": r.text
+    }
 
     for i, elem in enumerate(soup.find_all(class_="postlink")):
         if not elem.get("href"):
@@ -100,7 +135,11 @@ def parse_topic(topic_url):
 
         link = elem.get("href")
         if is_supported_premium_dl(link):
-            do_premium_download(link, i)
+            try:
+                do_premium_download(link, i)
+                break
+            except:
+                continue
 
 
 def login():
@@ -116,8 +155,6 @@ def login():
     }, headers={
         "Content-Type": "application/x-www-form-urlencoded"
     })
-    with open("debug.login.html", "wb") as f:
-        f.write(r.content)
 
 
 topic_id = get_topic_id(TOPIC_URL)
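For reference, the new add_buf_to_tar() helper introduced in the run patch packs an in-memory byte buffer into a .tar.gz member using tarfile.TarInfo plus TarFile.addfile(). The standalone sketch below (not part of the patch; the archive and member names are made up for illustration) shows the same technique in isolation:

# Illustrative sketch only: pack raw bytes into a .tar.gz, as add_buf_to_tar() does in the patch.
import json
import tarfile
from io import BytesIO
from tarfile import TarInfo


def pack(tar: tarfile.TarFile, member_name: str, data: bytes):
    info = TarInfo(name=member_name)
    info.size = len(data)              # addfile() reads exactly info.size bytes from the file object
    tar.addfile(info, BytesIO(data))   # no temporary file on disk is needed


with tarfile.open("example.tar.gz", "w:gz") as tar:
    pack(tar, "payload.bin", b"\x00" * 16)
    pack(tar, "meta.json", json.dumps({"ts": 0}).encode())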