2020-07-06 19:35:22 -04:00

163 lines
4.9 KiB
Python
Executable File

#!/usr/bin/env python3
import json
import os
import re
import subprocess
import tarfile
from io import BytesIO
from sys import stderr
from tarfile import TarInfo, TarFile
from time import time
from urllib.parse import urlparse, unquote
import requests
from bs4 import BeautifulSoup
# Materialize the rclone config shipped via the PROJECT_SECRET env var so the
# later upload step can point "--config" at it, then set up the shared HTTP
# session and the run metadata that gets bundled into every archive.
secret = json.loads(os.environ["PROJECT_SECRET"])
with open("tmp.conf", "w") as conf_file:
    conf_file.write(secret["rclone"])

session = requests.Session()
TOPIC_URL = os.environ["TASK_RECIPE"]
METADATA = {"ts": time(), "topic_url": TOPIC_URL}
# See https://forum.mobilism.org/filehosts.xml
# Hosts the mblservices premium downloader can handle.  The original literal
# contained many duplicated entries (harmless in a set, but noise to read);
# this is the same 36-host set, deduplicated and sorted.
PREMIUM_LINKS = {
    "2shared.com", "billionuploads.com", "centfile.com", "dailyuploads.cc",
    "dailyuploads.net", "ddl.to", "ddownload.com", "douploads.com",
    "douploads.me", "douploads.net", "dropapk.com", "dropapk.to",
    "dropgalaxy.in", "filerio.in", "filetitle.com", "filezip.cc",
    "hulkload.com", "intoupload.net", "mediafire.com", "mega4up.com",
    "mixloads.com", "rapidgator.net", "rapidshare.com", "rg.to",
    "sendit.cloud", "tusfiles.com", "ul.to", "upload.ac", "uploaded.net",
    "uploaded.to", "uploadev.com", "uploadev.org", "uploadrar.com",
    "uploadrar.net", "uplod.it", "userscloud.com",
}


def is_supported_premium_dl(link):
    """Return True if *link* points at a host the premium downloader supports.

    The hostname is compared case-insensitively against PREMIUM_LINKS.
    BUG FIX: the original compared the raw netloc, so "www."-prefixed links
    (e.g. https://www.mediafire.com/...) and links with an explicit :port
    never matched; both are normalized away before the membership test.
    """
    host = urlparse(link.lower()).netloc
    host = host.split(":", 1)[0]  # drop an explicit port, if any
    if host.startswith("www."):
        host = host[4:]
    return host in PREMIUM_LINKS
def add_buf_to_tar(tar: TarFile, filename: str, data: bytes) -> None:
    """Add *data* to *tar* as a regular file named *filename*.

    Builds the TarInfo header by hand so no on-disk file is needed.
    Idiom cleanup: BytesIO(data) starts positioned at offset 0, so the
    original write()/flush()/seek(0) sequence was redundant.
    """
    info = TarInfo(name=filename)
    info.size = len(data)  # addfile reads exactly info.size bytes
    tar.addfile(info, BytesIO(data))
def _download(link, i):
    """Fetch *link* and ship it to remote storage as a gzipped tarball.

    The archive "<topic_id><i>_<basename>.tar.gz" contains the downloaded
    payload plus a meta.json snapshot of METADATA.  Relies on the
    module-level `session` and `METADATA` and the global `topic_id`.
    """
    # URL path basename becomes the local filename; "/" is scrubbed in case
    # the unquoted path still contains separators.
    filename = "%s%02d_%s" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
    r = session.get(link)
    with tarfile.open(filename + ".tar.gz", "w:gz") as tar:
        add_buf_to_tar(tar, filename, r.content)
        add_buf_to_tar(tar, "meta.json", json.dumps(METADATA).encode())
    # Upload via rclone using the config written from PROJECT_SECRET.
    # NOTE(review): neither the HTTP status nor the rclone return code is
    # checked — a failed download/upload passes silently; confirm whether
    # this best-effort behavior is intentional.
    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename + ".tar.gz", "staging:mobilism/"])
def do_premium_download(link, i):
    """Resolve *link* through the mblservices premium downloader, then fetch it.

    Flow: POST the hoster link to the downloader app, scrape the follow-up
    confirmation form out of the response, re-POST that form's hidden fields,
    then pull the final download URL from the result and hand it to
    _download().  Raw responses are recorded in METADATA for debugging.
    Raises whatever _download / the final-link scrape raises on failure.
    """
    r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
        "link": link,
        "premium_acc": "on"
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })
    METADATA["do_premium_download"] = {
        "link": link,
        "response": r.text
    }
    soup = BeautifulSoup(r.content, "html.parser")
    form = soup.find("form")
    if not form:
        # No form means either a dead file (expected — stay quiet) or an
        # unexpected page layout (dump it to stderr for diagnosis).
        if "The file you were looking for could not be found" not in r.text:
            print(r.content, file=stderr)
        return
    # Echo the downloader's hidden form fields back verbatim.
    data = {
        "link": form.find("input", attrs={"name": "link"}).get("value"),
        "referer": form.find("input", attrs={"name": "referer"}).get("value"),
        "filename": form.find("input", attrs={"name": "filename"}).get("value"),
        "host": form.find("input", attrs={"name": "host"}).get("value"),
        "path": form.find("input", attrs={"name": "path"}).get("value"),
    }
    # The "port" field only appears for some hosters.
    port_el = form.find("input", attrs={"name": "port"})
    if port_el:
        data["port"] = port_el.get("value")
    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data)
    METADATA["do_premium_download2"] = {
        "data": data,
        "response": r2.text
    }
    soup2 = BeautifulSoup(r2.content, "html.parser")
    try:
        # Picks the first <a> that has a non-empty "download" attribute.
        # NOTE(review): the final URL is read from the "download" attribute
        # rather than "href" — confirm that is where this service puts it.
        download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
        _download(download_link, i)
    except Exception as e:
        if "not found" not in r2.text:
            print(r2.content, file=stderr)
        raise e
def get_topic_id(topic_url):
    """Extract the numeric topic id from a forum topic URL.

    Looks for a "t=<digits>" query parameter (e.g. ...viewtopic.php?f=19&t=123)
    and returns the digits as a string.  BUG FIX: a URL without one previously
    crashed with an opaque AttributeError on None; now raises ValueError with
    the offending URL in the message.
    """
    match = re.search(r"[&?]t=([0-9]+)", topic_url)
    if match is None:
        raise ValueError("no topic id (t=...) found in URL: %r" % topic_url)
    return match.group(1)
def parse_topic(topic_url):
    """Scan a forum topic page and download its first working premium link.

    Iterates over every "postlink" anchor on the page, skipping links whose
    host the premium downloader does not support, and stops after the first
    successful download.  The raw page is recorded in METADATA for debugging.
    """
    r = session.get(topic_url)
    soup = BeautifulSoup(r.content, "html.parser")
    METADATA["parse_topic"] = {
        # BUG FIX: the "topic_url" key previously stored the global
        # `topic_id`; record the URL that was actually fetched.
        "topic_url": topic_url,
        "response": r.text
    }
    for i, elem in enumerate(soup.find_all(class_="postlink")):
        link = elem.get("href")
        if not link:
            continue
        if is_supported_premium_dl(link):
            try:
                do_premium_download(link, i)
                break
            except Exception:
                # Best-effort: a failing host just moves us to the next
                # candidate link.  (Bare `except:` narrowed to Exception so
                # KeyboardInterrupt/SystemExit still propagate.)
                continue
def login():
    """Authenticate the module-level requests session against the forum.

    Fetches the login page first to scrape the hidden `sid` token, then
    posts the credentials so subsequent `session` requests carry the
    authenticated cookies.
    """
    r = session.get("https://forum.mobilism.org/ucp.php?mode=login")
    soup = BeautifulSoup(r.content, "html.parser")
    # SECURITY NOTE(review): credentials are hard-coded in source; consider
    # moving them into the PROJECT_SECRET payload alongside the rclone config.
    r = session.post("https://forum.mobilism.org/ucp.php?mode=login", data={
        "username": "78419273891",
        "password": "uprising-5overtly",
        "login": "Login",
        # A list value makes requests send the "redirect" field twice,
        # presumably mirroring the browser's form submission — verify.
        "redirect": ["./ucp.php?mode=login", "index.php"],
        "sid": soup.find("input", attrs={"name": "sid"}).get("value")
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })
# Script entry point: resolve the topic id first (it is a global consumed by
# _download for archive naming), authenticate the session, then walk the
# topic's links and upload the first premium download that succeeds.
topic_id = get_topic_id(TOPIC_URL)
login()
parse_topic(TOPIC_URL)