simon987 2020-07-06 19:30:32 -04:00
parent 57b994cc65
commit 532b27ab19
2 changed files with 72 additions and 32 deletions


@@ -1,14 +1,14 @@
+import re
 from queue import Queue
 from threading import Thread
 from urllib.parse import urljoin
 
 import browser_cookie3
 import requests
-import re
-from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
-from hexlib.concurrency import queue_iter
 from bs4 import BeautifulSoup
+from hexlib.concurrency import queue_iter
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 
 cj = browser_cookie3.firefox()
@@ -58,7 +58,7 @@ def get_posts(link, start):
         yield urljoin(BASE, elem.get("href"))
 
-api = TaskTrackerApi("http://localhost:8080/api")
+api = TaskTrackerApi("https://tt.simon987.net/api")
 worker = Worker.from_file(api)
 
 if not worker:
@@ -76,11 +76,12 @@ def submit_worker(q: Queue):
         worker.submit_task(**task)
 
-for _ in range(4):
+for _ in range(32):
     t = Thread(target=submit_worker, args=(q,))
     t.setDaemon(True)
     t.start()
 
+count = 0
 for page in range(0, 50000, 40):
     for topic_url in get_posts(SECTIONS["educational"], start=page):
         q.put(dict(
@@ -88,7 +89,9 @@ for page in range(0, 50000, 40):
             recipe=topic_url,
             max_assign_time=60 * 10,
             unique_str=get_topic_id(topic_url),
+            priority=max(32000 - page, 0)
         ))
-    print(page)
+    count += 1
+    print(count)
 
 q.join()
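For context, a minimal stdlib-only sketch of the daemon worker-pool pattern this producer script relies on. The real script drains the queue with hexlib's queue_iter and calls worker.submit_task(**task); the consume() body below is only a stand-in, and all names are illustrative:

from queue import Queue
from threading import Thread

q = Queue()

def consume(q: Queue):
    while True:
        task = q.get()              # blocks until a task is queued
        try:
            print("submit", task)   # stand-in for worker.submit_task(**task)
        finally:
            q.task_done()           # required so q.join() can unblock

for _ in range(32):
    # daemon threads exit with the main thread once q.join() returns
    Thread(target=consume, args=(q,), daemon=True).start()

for page in range(0, 200, 40):
    q.put(dict(recipe="topic_%d" % page, priority=max(32000 - page, 0)))

q.join()  # wait until every queued task has been processed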

run

@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 
-import gzip
 import json
 import os
-import pickle
 import re
 import subprocess
-from base64 import b64decode
+import tarfile
+from io import BytesIO
 from sys import stderr
+from tarfile import TarInfo, TarFile
+from time import time
 from urllib.parse import urlparse, unquote
 
 import requests
@@ -22,12 +23,22 @@ session = requests.Session()
 TOPIC_URL = os.environ["TASK_RECIPE"]
 
-PREMIUM_LINKS = (
+METADATA = {
+    "ts": time(),
+    "topic_url": TOPIC_URL
+}
+
+# See https://forum.mobilism.org/filehosts.xml
+PREMIUM_LINKS = {
     "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
     "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
-    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
-    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
-)
+    "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
+    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in",
+    "rapidshare.com", "uplod.it", "2shared.com", "billionuploads.com", "tusfiles.com",
+    "dropapk.com", "dropapk.to", "douploads.com", "douploads.me", "dailyuploads.net", "dailyuploads.cc",
+    "upload.ac", "ddl.to", "ddownload.com", "uploadev.com", "uploadev.org", "uploadrar.com", "uploadrar.net",
+    "filetitle.com", "filerio.in", "mega4up.com", "filezip.cc", "dropgalaxy.in"
+}
 
 
 def is_supported_premium_dl(link):
@@ -35,16 +46,26 @@ def is_supported_premium_dl(link):
     return parsed.netloc in PREMIUM_LINKS
 
 
+def add_buf_to_tar(tar: TarFile, filename, data: bytes):
+    buf = BytesIO()
+    buf.write(data)
+    buf.flush()
+    buf.seek(0)
+    info = TarInfo(name=filename)
+    info.size = len(data)
+    tar.addfile(info, buf)
+
+
 def _download(link, i):
-    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
+    filename = "%s%02d_%s" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
     r = session.get(link)
 
-    with gzip.open(filename, "wb") as f:
-        f.write(r.content)
+    with tarfile.open(filename + ".tar.gz", "w:gz") as tar:
+        add_buf_to_tar(tar, filename, r.content)
+        add_buf_to_tar(tar, "meta.json", json.dumps(METADATA).encode())
 
-    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
+    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename + ".tar.gz", "staging:mobilism/"])
+    quit(0)
 
 
 def do_premium_download(link, i):
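For reference, a small sketch of reading back one of the archives _download now produces, assuming it contains the downloaded payload plus the meta.json member written above (the archive name below is hypothetical):

import json
import tarfile

# hypothetical name following the "%s%02d_%s" + ".tar.gz" pattern used above
with tarfile.open("12345600_example.pdf.tar.gz", "r:gz") as tar:
    meta = json.load(tar.extractfile("meta.json"))    # crawl metadata added by the task
    print(meta["topic_url"], meta["ts"])
    for member in tar.getmembers():
        if member.name != "meta.json":
            payload = tar.extractfile(member).read()  # the downloaded file itself
            print(member.name, len(payload), "bytes")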
@@ -55,32 +76,44 @@ def do_premium_download(link, i):
         "Content-Type": "application/x-www-form-urlencoded"
     })
 
+    METADATA["do_premium_download"] = {
+        "link": link,
+        "response": r.text
+    }
+
     soup = BeautifulSoup(r.content, "html.parser")
     form = soup.find("form")
 
-    with open("debug.do_premium_download.html", "wb") as f:
-        f.write(r.content)
-
     if not form:
         if "The file you were looking for could not be found" not in r.text:
             print(r.content, file=stderr)
         return
 
-    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
+    data = {
         "link": form.find("input", attrs={"name": "link"}).get("value"),
         "referer": form.find("input", attrs={"name": "referer"}).get("value"),
         "filename": form.find("input", attrs={"name": "filename"}).get("value"),
         "host": form.find("input", attrs={"name": "host"}).get("value"),
         "path": form.find("input", attrs={"name": "path"}).get("value"),
-    })
+    }
+    port_el = form.find("input", attrs={"name": "port"})
+    if port_el:
+        data["port"] = port_el.get("value")
+
+    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data)
+
+    METADATA["do_premium_download2"] = {
+        "data": data,
+        "response": r2.text
+    }
 
     soup2 = BeautifulSoup(r2.content, "html.parser")
     try:
         download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
         _download(download_link, i)
-    except:
-        if "not found" not in r.text:
+    except Exception as e:
+        if "not found" not in r2.text:
             print(r2.content, file=stderr)
-        pass
+        raise e
 
 
 def get_topic_id(topic_url):
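As an illustration of the hidden-form relay technique do_premium_download implements, here is a generalized sketch that collects every named input of the intermediate form instead of listing fields explicitly, which is one way to carry optional fields such as "port" only when the form actually contains them (URLs and names are placeholders, not the script's own):

import requests
from bs4 import BeautifulSoup

def relay_form(session: requests.Session, page_url: str, submit_url: str):
    r = session.get(page_url)
    form = BeautifulSoup(r.content, "html.parser").find("form")
    if not form:
        return None

    # copy every named <input>, so optional fields (e.g. "port") are
    # included only when present in the form
    data = {
        inp.get("name"): inp.get("value", "")
        for inp in form.find_all("input")
        if inp.get("name")
    }
    return session.post(submit_url, data)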
@@ -91,8 +124,10 @@ def parse_topic(topic_url):
     r = session.get(topic_url)
     soup = BeautifulSoup(r.content, "html.parser")
 
-    with open("debug.parse_topic.html", "wb") as f:
-        f.write(r.content)
+    METADATA["parse_topic"] = {
+        "topic_url": topic_id,
+        "response": r.text
+    }
 
     for i, elem in enumerate(soup.find_all(class_="postlink")):
         if not elem.get("href"):
@@ -100,7 +135,11 @@ def parse_topic(topic_url):
         link = elem.get("href")
         if is_supported_premium_dl(link):
-            do_premium_download(link, i)
+            try:
+                do_premium_download(link, i)
+                break
+            except:
+                continue
 
 
 def login():
@@ -116,8 +155,6 @@ def login():
     }, headers={
         "Content-Type": "application/x-www-form-urlencoded"
     })
 
-    with open("debug.login.html", "wb") as f:
-        f.write(r.content)
-
 
 topic_id = get_topic_id(TOPIC_URL)