mirror of
				https://github.com/simon987/mobilism_scrape.git
				synced 2025-11-04 03:56:52 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			126 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			126 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env python3
 | 
						|
 | 
						|
import gzip
 | 
						|
import json
 | 
						|
import os
 | 
						|
import pickle
 | 
						|
import re
 | 
						|
import subprocess
 | 
						|
from base64 import b64decode
 | 
						|
from sys import stderr
 | 
						|
from urllib.parse import urlparse, unquote
 | 
						|
 | 
						|
import requests
 | 
						|
from bs4 import BeautifulSoup
 | 
						|
 | 
						|
# Task configuration arrives through the environment:
#   PROJECT_SECRET - JSON blob; its "rclone" key holds a ready-made
#                    rclone config used for the upload in _download().
#   TASK_RECIPE    - URL of the forum topic to scrape.
secret = json.loads(os.environ["PROJECT_SECRET"])

# Materialize the rclone config on disk for the subprocess call later.
with open("tmp.conf", "w") as f:
    f.write(secret["rclone"])

# One session shared by every request so the forum login cookie sticks.
session = requests.Session()

TOPIC_URL = os.environ["TASK_RECIPE"]
 | 
						|
 | 
						|
# File-host domains (matched against urlparse().netloc of the lower-cased
# link) that the mblservices premium downloader can resolve; consulted by
# is_supported_premium_dl().
PREMIUM_LINKS = (
    "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
    "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
)
 | 
						|
 | 
						|
 | 
						|
def is_supported_premium_dl(link):
    """Return True when *link* is hosted on a file host that the premium
    downloader service knows how to resolve (see PREMIUM_LINKS)."""
    host = urlparse(link.lower()).netloc
    return host in PREMIUM_LINKS
 | 
						|
 | 
						|
 | 
						|
def _download(link, i):
    """Fetch *link*, gzip it to a local file, upload it with rclone, then exit.

    The archive name embeds the module-global ``topic_id`` and the link's
    index *i* so mirrors of the same topic don't collide on disk.  The
    process exits with status 0 after the upload attempt: one task is meant
    to produce exactly one file.
    """
    safe_name = unquote(os.path.basename(link)).replace("/", "_")
    filename = "%s%02d_%s.gz" % (topic_id, i, safe_name)

    # Stream the response instead of buffering the whole file in memory
    # (these hosts serve large archives).
    r = session.get(link, stream=True)

    with gzip.open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=65536):
            f.write(chunk)

    result = subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
    if result.returncode != 0:
        # Best-effort reporting; keep the original "always exit 0" contract.
        print("rclone exited with status %d" % result.returncode, file=stderr)
    quit(0)
 | 
						|
 | 
						|
 | 
						|
def do_premium_download(link, i):
    """Resolve *link* through the mblservices premium downloader and fetch it.

    Two-step flow: POST the original link to obtain a confirmation form,
    re-POST the form's hidden fields, then scrape the resulting
    ``<a download=...>`` anchor and hand its URL to _download().

    Returns silently when the remote reports the file as gone; any other
    unexpected page is dumped to stderr for debugging.

    BUG FIX: the original wrapped the _download() call in a bare
    ``except:``, which also swallowed the SystemExit raised by quit(0)
    inside _download() — so the script never actually stopped after a
    successful download, and the handler then mis-reported the success
    page to stderr.  Only the scrape step is guarded now, and only
    against the AttributeError that ``find()`` returning None produces.
    """
    r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
        "link": link,
        "premium_acc": "on"
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })

    soup = BeautifulSoup(r.content, "html.parser")
    form = soup.find("form")

    # Keep the raw response around for post-mortem debugging.
    with open("debug.do_premium_download.html", "wb") as f:
        f.write(r.content)

    if not form:
        if "The file you were looking for could not be found" not in r.text:
            print(r.content, file=stderr)
        return

    # Echo the confirmation form's hidden fields back to the downloader.
    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
        "link": form.find("input", attrs={"name": "link"}).get("value"),
        "referer": form.find("input", attrs={"name": "referer"}).get("value"),
        "filename": form.find("input", attrs={"name": "filename"}).get("value"),
        "host": form.find("input", attrs={"name": "host"}).get("value"),
        "path": form.find("input", attrs={"name": "path"}).get("value"),
    })
    soup2 = BeautifulSoup(r2.content, "html.parser")
    try:
        # The anchor carrying a non-empty "download" attribute holds the
        # direct link; find() returns None when it is absent.
        download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
    except AttributeError:
        if "not found" not in r.text:
            print(r2.content, file=stderr)
        return
    _download(download_link, i)
 | 
						|
 | 
						|
 | 
						|
def get_topic_id(topic_url):
    """Extract the numeric phpBB topic id from *topic_url*.

    The id is the value of the ``t`` query parameter, e.g.
    ``viewtopic.php?f=19&t=12345`` -> ``"12345"``.

    Raises:
        ValueError: when the URL carries no ``t=<digits>`` parameter
            (previously this surfaced as a cryptic AttributeError on
            ``None.group``).
    """
    match = re.search(r"[&?]t=([0-9]+)", topic_url)
    if match is None:
        raise ValueError("no topic id (t=...) found in URL: %r" % topic_url)
    return match.group(1)
 | 
						|
 | 
						|
 | 
						|
def parse_topic(topic_url):
    """Scrape a forum topic page and attempt every supported premium link.

    Each ``.postlink`` anchor with an href on a supported file host is
    passed to do_premium_download() together with its position index.
    """
    response = session.get(topic_url)

    # Keep the raw page around for post-mortem debugging.
    with open("debug.parse_topic.html", "wb") as f:
        f.write(response.content)

    page = BeautifulSoup(response.content, "html.parser")

    for index, anchor in enumerate(page.find_all(class_="postlink")):
        href = anchor.get("href")
        if not href:
            continue
        if is_supported_premium_dl(href):
            do_premium_download(href, index)
 | 
						|
 | 
						|
 | 
						|
def login():
    """Authenticate the shared session against forum.mobilism.org.

    Fetches the login page first to harvest the hidden ``sid`` token that
    phpBB requires, then posts the credential form.  The response body is
    saved to disk so a failed login can be diagnosed offline.

    NOTE(review): the credentials are hard-coded below; consider moving
    them into PROJECT_SECRET alongside the rclone config.
    """
    login_url = "https://forum.mobilism.org/ucp.php?mode=login"

    page = session.get(login_url)
    sid = BeautifulSoup(page.content, "html.parser").find("input", attrs={"name": "sid"}).get("value")

    response = session.post(login_url, data={
        "username": "78419273891",
        "password": "uprising-5overtly",
        "login": "Login",
        "redirect": ["./ucp.php?mode=login", "index.php"],
        "sid": sid
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })
    with open("debug.login.html", "wb") as f:
        f.write(response.content)
 | 
						|
 | 
						|
 | 
						|
# Script entry point: resolve the topic id (used by _download for file
# naming), authenticate, then scrape the topic for premium links.
topic_id = get_topic_id(TOPIC_URL)
login()
parse_topic(TOPIC_URL)
 |