mirror of
https://github.com/simon987/mobilism_scrape.git
synced 2025-04-18 01:56:45 +00:00
117 lines
3.3 KiB
Python
Executable File
117 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import gzip
|
|
import json
|
|
import os
|
|
import pickle
|
|
import re
|
|
import subprocess
|
|
from base64 import b64decode
|
|
from hashlib import sha1
|
|
from sys import stderr
|
|
from urllib.parse import urlparse, unquote
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from requests.cookies import RequestsCookieJar
|
|
|
|
|
|
def decode_cookiejar(data):
    """Rebuild a RequestsCookieJar from a base64-encoded pickle blob.

    NOTE(review): `pickle.loads` is only safe here because the payload comes
    from our own PROJECT_SECRET, never from untrusted input.
    """
    raw = b64decode(data)
    restored_cookies = pickle.loads(raw)
    jar = RequestsCookieJar()
    jar._cookies = restored_cookies
    return jar
|
|
|
|
|
|
# import browser_cookie3
|
|
# cj = cookiejar_filter(browser_cookie3.firefox(), "forum.mobilism.org|mblservices.org")
|
|
# with open("cookies.txt", "w") as f:
|
|
# f.write(encode_cookiejar(cj))
|
|
|
|
# sha1 digests of payloads already uploaded this run (dedup across links).
done = set()

# PROJECT_SECRET is a JSON blob holding the rclone config text and the
# base64-encoded pickled cookie jar for forum.mobilism.org / mblservices.org.
secret = json.loads(os.environ["PROJECT_SECRET"])

# Materialize the rclone config so the `rclone copy` subprocess can read it.
with open("tmp.conf", "w") as f:
    f.write(secret["rclone"])

cj = decode_cookiejar(secret["cookies"])

# Authenticated session reused for every forum/downloader request below.
session = requests.Session()
session.cookies = cj

# Topic URL to scrape, handed in by the task queue.
TOPIC_URL = os.environ["TASK_RECIPE"]
|
|
|
|
# Hosts the mblservices premium downloader can resolve.
# frozenset gives O(1) membership tests (the original tuple scanned linearly).
PREMIUM_LINKS = frozenset({
    "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
    "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in",
})


def is_supported_premium_dl(link):
    """Return True if *link*'s host is one of the supported premium hosts.

    Matching is case-insensitive but exact on the netloc: a link to
    "www.mediafire.com" would NOT match "mediafire.com".
    """
    parsed = urlparse(link.lower())
    return parsed.netloc in PREMIUM_LINKS
|
|
|
|
|
|
def _download(link, i):
    """Fetch *link*, gzip it to disk, and upload unseen content to staging.

    The local file is named "<topic_id><index>_<basename>.gz"; `topic_id`
    is a module-level global derived from TASK_RECIPE.
    """
    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))

    r = session.get(link)

    with gzip.open(filename, "wb") as f:
        f.write(r.content)

    sha1sum = sha1(r.content).hexdigest()
    # Upload only content we have NOT already uploaded this run. The original
    # condition was inverted ("if sha1sum in done"), which uploaded only
    # duplicates and silently skipped every new file.
    if sha1sum not in done:
        subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
    done.add(sha1sum)
|
|
|
|
|
|
def do_premium_download(link, i):
|
|
r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
|
|
"link": link,
|
|
"premium_acc": "on"
|
|
}, headers={
|
|
"Content-Type": "application/x-www-form-urlencoded"
|
|
})
|
|
|
|
soup = BeautifulSoup(r.content, "html.parser")
|
|
form = soup.find("form")
|
|
|
|
r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
|
|
"link": form.find("input", attrs={"name": "link"}).get("value"),
|
|
"referer": form.find("input", attrs={"name": "referer"}).get("value"),
|
|
"filename": form.find("input", attrs={"name": "filename"}).get("value"),
|
|
"host": form.find("input", attrs={"name": "host"}).get("value"),
|
|
"path": form.find("input", attrs={"name": "path"}).get("value"),
|
|
})
|
|
soup2 = BeautifulSoup(r2.content, "html.parser")
|
|
try:
|
|
download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
|
|
_download(download_link, i)
|
|
except:
|
|
print(r2.content, file=stderr)
|
|
pass
|
|
|
|
|
|
def get_topic_id(topic_url):
    """Extract the numeric phpBB topic id from the ?t=/&t= query parameter."""
    match = re.search("[&?]t=([0-9]+)", topic_url)
    return match.group(1)
|
|
|
|
|
|
def parse_topic(topic_url):
    """Scan a topic page and premium-download every supported postlink."""
    page = session.get(topic_url)
    doc = BeautifulSoup(page.content, "html.parser")

    for index, anchor in enumerate(doc.find_all(class_="postlink")):
        # Guard clause: anchors without an href (or with an empty one) are
        # skipped, but they still consume an enumerate index as before.
        link = anchor.get("href")
        if not link:
            continue
        if is_supported_premium_dl(link):
            do_premium_download(link, index)
|
|
|
|
|
|
# Entry point: topic_id is a module global read by _download() when it
# builds output filenames, so it must be set before parse_topic() runs.
topic_id = get_topic_id(TOPIC_URL)

parse_topic(TOPIC_URL)