mirror of https://github.com/simon987/mobilism_scrape.git synced 2025-04-09 05:46:44 +00:00
simon987 2020-07-06 19:30:32 -04:00
parent 57b994cc65
commit 532b27ab19
2 changed files with 72 additions and 32 deletions

@@ -1,14 +1,14 @@
import re
from queue import Queue
from threading import Thread
from urllib.parse import urljoin
import browser_cookie3
import requests
import re
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
from hexlib.concurrency import queue_iter
from bs4 import BeautifulSoup
from hexlib.concurrency import queue_iter
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
cj = browser_cookie3.firefox()
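The scraper authenticates by reusing the cookies from the local Firefox profile via browser_cookie3 instead of logging in programmatically. A minimal sketch of that technique, assuming a Firefox install with a logged-in session; the URL is only a placeholder:

import browser_cookie3
import requests

# Load cookies straight from the local Firefox profile.
cj = browser_cookie3.firefox()

# Pass the CookieJar per request; requests merges it into the outgoing request.
r = requests.get("https://forum.mobilism.org/", cookies=cj)
print(r.status_code)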
@@ -58,7 +58,7 @@ def get_posts(link, start):
yield urljoin(BASE, elem.get("href"))
api = TaskTrackerApi("http://localhost:8080/api")
api = TaskTrackerApi("https://tt.simon987.net/api")
worker = Worker.from_file(api)
if not worker:
@@ -76,11 +76,12 @@ def submit_worker(q: Queue):
worker.submit_task(**task)
for _ in range(4):
for _ in range(32):
t = Thread(target=submit_worker, args=(q,))
t.setDaemon(True)
t.start()
count = 0
for page in range(0, 50000, 40):
for topic_url in get_posts(SECTIONS["educational"], start=page):
q.put(dict(
@@ -88,7 +89,9 @@ for page in range(0, 50000, 40):
recipe=topic_url,
max_assign_time=60 * 10,
unique_str=get_topic_id(topic_url),
priority=max(32000 - page, 0)
))
print(page)
count += 1
print(count)
q.join()
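This first file is the task producer: it now spins up 32 daemon threads (up from 4) that drain a shared Queue and submit each item to the task_tracker API, while the main loop enqueues one task per topic URL and finally blocks on q.join(). A stdlib-only sketch of that producer/consumer pattern, with a print standing in for worker.submit_task (Worker and hexlib's queue_iter are assumed and not shown; the URLs and range are illustrative):

from queue import Queue
from threading import Thread

q = Queue()

def submit_worker(q: Queue):
    while True:
        task = q.get()              # blocks until a task dict is available
        try:
            print("submit", task)   # stand-in for worker.submit_task(**task)
        finally:
            q.task_done()           # lets q.join() account for this item

for _ in range(32):                 # daemon threads die with the main thread
    Thread(target=submit_worker, args=(q,), daemon=True).start()

for page in range(0, 120, 40):      # tiny range for illustration
    q.put(dict(recipe="https://example.org/topic/%d" % page,
               max_assign_time=60 * 10,
               priority=max(32000 - page, 0)))

q.join()                            # wait until every queued task was processed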

run

@@ -1,13 +1,14 @@
#!/usr/bin/env python3
import gzip
import json
import os
import pickle
import re
import subprocess
from base64 import b64decode
import tarfile
from io import BytesIO
from sys import stderr
from tarfile import TarInfo, TarFile
from time import time
from urllib.parse import urlparse, unquote
import requests
@@ -22,12 +23,22 @@ session = requests.Session()
TOPIC_URL = os.environ["TASK_RECIPE"]
PREMIUM_LINKS = (
METADATA = {
"ts": time(),
"topic_url": TOPIC_URL
}
# See https://forum.mobilism.org/filehosts.xml
PREMIUM_LINKS = {
"tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
"mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
"uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
"hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
)
"mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
"hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in",
"rapidshare.com", "uplod.it", "2shared.com", "billionuploads.com", "tusfiles.com",
"dropapk.com", "dropapk.to", "douploads.com", "douploads.me", "dailyuploads.net", "dailyuploads.cc",
"upload.ac", "ddl.to", "ddownload.com", "uploadev.com", "uploadev.org", "uploadrar.com", "uploadrar.net",
"filetitle.com", "filerio.in", "mega4up.com", "filezip.cc", "dropgalaxy.in"
}
def is_supported_premium_dl(link):
@@ -35,16 +46,26 @@ def is_supported_premium_dl(link):
return parsed.netloc in PREMIUM_LINKS
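The allowlist of supported file hosts becomes a set (per https://forum.mobilism.org/filehosts.xml), and is_supported_premium_dl checks a link's host against it. A small sketch of that check; the host set here is abbreviated for illustration:

from urllib.parse import urlparse

PREMIUM_LINKS = {"rapidgator.net", "ddownload.com", "mediafire.com"}  # abbreviated

def is_supported_premium_dl(link):
    parsed = urlparse(link)
    # netloc is the bare host, e.g. "rapidgator.net" for "https://rapidgator.net/file/abc"
    return parsed.netloc in PREMIUM_LINKS

print(is_supported_premium_dl("https://rapidgator.net/file/abc"))  # True
print(is_supported_premium_dl("https://example.com/file/abc"))     # False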
def add_buf_to_tar(tar: TarFile, filename, data: bytes):
buf = BytesIO()
buf.write(data)
buf.flush()
buf.seek(0)
info = TarInfo(name=filename)
info.size = len(data)
tar.addfile(info, buf)
def _download(link, i):
filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
filename = "%s%02d_%s" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
r = session.get(link)
with gzip.open(filename, "wb") as f:
f.write(r.content)
with tarfile.open(filename + ".tar.gz", "w:gz") as tar:
add_buf_to_tar(tar, filename, r.content)
add_buf_to_tar(tar, "meta.json", json.dumps(METADATA).encode())
subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
quit(0)
subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename + ".tar.gz", "staging:mobilism/"])
def do_premium_download(link, i):
@@ -55,32 +76,44 @@ def do_premium_download(link, i):
"Content-Type": "application/x-www-form-urlencoded"
})
METADATA["do_premium_download"] = {
"link": link,
"response": r.text
}
soup = BeautifulSoup(r.content, "html.parser")
form = soup.find("form")
with open("debug.do_premium_download.html", "wb") as f:
f.write(r.content)
if not form:
if "The file you were looking for could not be found" not in r.text:
print(r.content, file=stderr)
return
r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
data = {
"link": form.find("input", attrs={"name": "link"}).get("value"),
"referer": form.find("input", attrs={"name": "referer"}).get("value"),
"filename": form.find("input", attrs={"name": "filename"}).get("value"),
"host": form.find("input", attrs={"name": "host"}).get("value"),
"path": form.find("input", attrs={"name": "path"}).get("value"),
})
}
port_el = form.find("input", attrs={"name": "port"})
if port_el:
data["port"] = port_el.get("value")
r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data)
METADATA["do_premium_download2"] = {
"data": data,
"response": r2.text
}
soup2 = BeautifulSoup(r2.content, "html.parser")
try:
download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
_download(download_link, i)
except:
if "not found" not in r.text:
except Exception as e:
if "not found" not in r2.text:
print(r2.content, file=stderr)
pass
raise e
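do_premium_download drives the mblservices.org downloader in two steps: request the intermediate page, scrape the hidden form inputs (now including the optional port field), and re-post them to obtain the final download link. A sketch of the form-scraping step over hypothetical HTML; the real field set and the second POST are as in the diff above:

from bs4 import BeautifulSoup

# Hypothetical HTML standing in for the downloader's intermediate page.
html = """
<form method="post">
  <input type="hidden" name="link" value="https://rapidgator.net/file/abc">
  <input type="hidden" name="filename" value="abc.rar">
  <input type="hidden" name="host" value="rapidgator.net">
  <input type="hidden" name="port" value="443">
</form>
"""

soup = BeautifulSoup(html, "html.parser")
form = soup.find("form")

data = {
    "link": form.find("input", attrs={"name": "link"}).get("value"),
    "filename": form.find("input", attrs={"name": "filename"}).get("value"),
    "host": form.find("input", attrs={"name": "host"}).get("value"),
}

# "port" is optional, so only include it when the input is present.
port_el = form.find("input", attrs={"name": "port"})
if port_el:
    data["port"] = port_el.get("value")

print(data)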
def get_topic_id(topic_url):
@@ -91,8 +124,10 @@ def parse_topic(topic_url):
r = session.get(topic_url)
soup = BeautifulSoup(r.content, "html.parser")
with open("debug.parse_topic.html", "wb") as f:
f.write(r.content)
METADATA["parse_topic"] = {
"topic_url": topic_id,
"response": r.text
}
for i, elem in enumerate(soup.find_all(class_="postlink")):
if not elem.get("href"):
@@ -100,7 +135,11 @@ def parse_topic(topic_url):
link = elem.get("href")
if is_supported_premium_dl(link):
do_premium_download(link, i)
try:
do_premium_download(link, i)
break
except:
continue
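parse_topic now wraps each premium download in try/except and breaks out of the loop after the first link that succeeds, falling through to the next postlink otherwise. The same first-success pattern in isolation; first_successful_download is a hypothetical helper and download is any callable that raises on failure:

def first_successful_download(links, download):
    # Try hosts in order; stop at the first one that works, skip the rest on failure.
    for i, link in enumerate(links):
        try:
            download(link, i)
            return link
        except Exception:
            continue
    return None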
def login():
@@ -116,8 +155,6 @@ def login():
}, headers={
"Content-Type": "application/x-www-form-urlencoded"
})
with open("debug.login.html", "wb") as f:
f.write(r.content)
topic_id = get_topic_id(TOPIC_URL)