mirror of https://github.com/simon987/mobilism_scrape.git
synced 2025-04-17 17:46:43 +00:00

commit 532b27ab19 ("update")
parent 57b994cc65
@@ -1,14 +1,14 @@
+import re
 from queue import Queue
 from threading import Thread
 from urllib.parse import urljoin
 
 import browser_cookie3
 import requests
-import re
-from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
-from hexlib.concurrency import queue_iter
 from bs4 import BeautifulSoup
+from hexlib.concurrency import queue_iter
 
+from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 
 cj = browser_cookie3.firefox()
 
@@ -58,7 +58,7 @@ def get_posts(link, start):
         yield urljoin(BASE, elem.get("href"))
 
 
-api = TaskTrackerApi("http://localhost:8080/api")
+api = TaskTrackerApi("https://tt.simon987.net/api")
 
 worker = Worker.from_file(api)
 if not worker:
@@ -76,11 +76,12 @@ def submit_worker(q: Queue):
         worker.submit_task(**task)
 
 
-for _ in range(4):
+for _ in range(32):
     t = Thread(target=submit_worker, args=(q,))
     t.setDaemon(True)
     t.start()
 
+count = 0
 for page in range(0, 50000, 40):
     for topic_url in get_posts(SECTIONS["educational"], start=page):
         q.put(dict(
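Note: task submission now fans out to 32 daemon threads draining one shared Queue. A minimal, self-contained sketch of the same producer/consumer pattern (stand-ins replace the repo's worker and queue_iter; daemon=True is the current spelling of the deprecated t.setDaemon(True)):

from queue import Queue
from threading import Thread

q = Queue()

def submit_worker(q: Queue):
    # Drain the queue forever; task_done() is what lets q.join() return.
    while True:
        task = q.get()
        print("submitting", task)  # stand-in for worker.submit_task(**task)
        q.task_done()

for _ in range(32):
    Thread(target=submit_worker, args=(q,), daemon=True).start()

for n in range(100):
    q.put(dict(recipe="topic-%d" % n))

q.join()  # unblocks once all 100 puts have been marked done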
@@ -88,7 +89,9 @@ for page in range(0, 50000, 40):
             recipe=topic_url,
             max_assign_time=60 * 10,
             unique_str=get_topic_id(topic_url),
+            priority=max(32000 - page, 0)
         ))
-    print(page)
+        count += 1
+    print(count)
 
 q.join()
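The new priority field ranks earlier listing pages above later ones, clamped at zero past page 32000. Checking the arithmetic on a few page offsets (plain evaluation of the formula, not tracker output):

for page in (0, 40, 16000, 32000, 49960):
    print(page, max(32000 - page, 0))
# 0 -> 32000, 40 -> 31960, 16000 -> 16000, 32000 -> 0, 49960 -> 0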
run (87 lines changed)
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 
-import gzip
 import json
 import os
-import pickle
 import re
 import subprocess
-from base64 import b64decode
+import tarfile
+from io import BytesIO
 from sys import stderr
+from tarfile import TarInfo, TarFile
+from time import time
 from urllib.parse import urlparse, unquote
 
 import requests
@@ -22,12 +23,22 @@ session = requests.Session()
 
 TOPIC_URL = os.environ["TASK_RECIPE"]
 
-PREMIUM_LINKS = (
+METADATA = {
+    "ts": time(),
+    "topic_url": TOPIC_URL
+}
+
+# See https://forum.mobilism.org/filehosts.xml
+PREMIUM_LINKS = {
     "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
     "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
-    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
-    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
-)
+    "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
+    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in",
+    "rapidshare.com", "uplod.it", "2shared.com", "billionuploads.com", "tusfiles.com",
+    "dropapk.com", "dropapk.to", "douploads.com", "douploads.me", "dailyuploads.net", "dailyuploads.cc",
+    "upload.ac", "ddl.to", "ddownload.com", "uploadev.com", "uploadev.org", "uploadrar.com", "uploadrar.net",
+    "filetitle.com", "filerio.in", "mega4up.com", "filezip.cc", "dropgalaxy.in"
+}
 
 
 def is_supported_premium_dl(link):
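PREMIUM_LINKS changing from a tuple to a set makes the membership test in is_supported_premium_dl O(1) instead of a linear scan. One caveat worth knowing: urlparse().netloc keeps any www. prefix, so a www-prefixed mirror link would not match the bare domains in the list (assuming the forum links hosts bare, as the list suggests). A small sketch:

from urllib.parse import urlparse

PREMIUM_LINKS = {"mediafire.com", "ddownload.com"}

def is_supported_premium_dl(link):
    return urlparse(link).netloc in PREMIUM_LINKS

print(is_supported_premium_dl("https://mediafire.com/file/abc"))      # True
print(is_supported_premium_dl("https://www.mediafire.com/file/abc"))  # False: netloc is "www.mediafire.com"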
@@ -35,16 +46,26 @@ def is_supported_premium_dl(link):
     return parsed.netloc in PREMIUM_LINKS
 
 
+def add_buf_to_tar(tar: TarFile, filename, data: bytes):
+    buf = BytesIO()
+    buf.write(data)
+    buf.flush()
+    buf.seek(0)
+    info = TarInfo(name=filename)
+    info.size = len(data)
+    tar.addfile(info, buf)
+
+
 def _download(link, i):
-    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
+    filename = "%s%02d_%s" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))
 
     r = session.get(link)
 
-    with gzip.open(filename, "wb") as f:
-        f.write(r.content)
+    with tarfile.open(filename + ".tar.gz", "w:gz") as tar:
+        add_buf_to_tar(tar, filename, r.content)
+        add_buf_to_tar(tar, "meta.json", json.dumps(METADATA).encode())
 
-    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename, "staging:mobilism/"])
-    quit(0)
+    subprocess.run(["rclone", "copy", "--config", "tmp.conf", filename + ".tar.gz", "staging:mobilism/"])
 
 
 def do_premium_download(link, i):
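add_buf_to_tar is the standard trick for writing in-memory bytes as a tar member: TarFile.addfile needs a file object plus a TarInfo whose size is set explicitly, otherwise it copies zero bytes and writes an empty member. A round-trip sketch under those assumptions:

import tarfile
from io import BytesIO
from tarfile import TarInfo, TarFile

def add_buf_to_tar(tar: TarFile, filename, data: bytes):
    buf = BytesIO(data)            # seekable in-memory file, positioned at 0
    info = TarInfo(name=filename)
    info.size = len(data)          # addfile copies exactly info.size bytes
    tar.addfile(info, buf)

with tarfile.open("demo.tar.gz", "w:gz") as tar:
    add_buf_to_tar(tar, "meta.json", b'{"ts": 0}')

with tarfile.open("demo.tar.gz", "r:gz") as tar:
    print(tar.extractfile("meta.json").read())  # b'{"ts": 0}'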
@@ -55,32 +76,44 @@ def do_premium_download(link, i):
         "Content-Type": "application/x-www-form-urlencoded"
     })
 
+    METADATA["do_premium_download"] = {
+        "link": link,
+        "response": r.text
+    }
+
     soup = BeautifulSoup(r.content, "html.parser")
     form = soup.find("form")
 
-    with open("debug.do_premium_download.html", "wb") as f:
-        f.write(r.content)
-
     if not form:
         if "The file you were looking for could not be found" not in r.text:
             print(r.content, file=stderr)
         return
 
-    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
+    data = {
         "link": form.find("input", attrs={"name": "link"}).get("value"),
         "referer": form.find("input", attrs={"name": "referer"}).get("value"),
         "filename": form.find("input", attrs={"name": "filename"}).get("value"),
         "host": form.find("input", attrs={"name": "host"}).get("value"),
         "path": form.find("input", attrs={"name": "path"}).get("value"),
-    })
+    }
+    port_el = form.find("input", attrs={"name": "port"})
+    if port_el:
+        data["port"] = port_el.get("value")
+    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data)
+
+    METADATA["do_premium_download2"] = {
+        "data": data,
+        "response": r2.text
+    }
+
     soup2 = BeautifulSoup(r2.content, "html.parser")
     try:
         download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
         _download(download_link, i)
-    except:
-        if "not found" not in r.text:
+    except Exception as e:
+        if "not found" not in r2.text:
             print(r2.content, file=stderr)
-        pass
+        raise e
 
 
 def get_topic_id(topic_url):
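The hidden downloader form is replayed field by field, and the port input only exists for some filehosts, hence the new conditional. A hypothetical helper (form_fields is illustrative, not in the repo) expressing the same replay generically:

def form_fields(form, names, optional=("port",)):
    # Collect hidden <input> values by name, skipping any that are absent.
    data = {}
    for name in (*names, *optional):
        el = form.find("input", attrs={"name": name})
        if el is not None:
            data[name] = el.get("value")
    return data

# usage: data = form_fields(form, ("link", "referer", "filename", "host", "path"))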
@@ -91,8 +124,10 @@ def parse_topic(topic_url):
     r = session.get(topic_url)
     soup = BeautifulSoup(r.content, "html.parser")
 
-    with open("debug.parse_topic.html", "wb") as f:
-        f.write(r.content)
+    METADATA["parse_topic"] = {
+        "topic_url": topic_id,
+        "response": r.text
+    }
 
     for i, elem in enumerate(soup.find_all(class_="postlink")):
         if not elem.get("href"):
@@ -100,7 +135,11 @@ def parse_topic(topic_url):
 
         link = elem.get("href")
         if is_supported_premium_dl(link):
-            do_premium_download(link, i)
+            try:
+                do_premium_download(link, i)
+                break
+            except:
+                continue
 
 
 def login():
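Wrapping do_premium_download in try/except turns the postlink loop into a first-success fallback: the first mirror that downloads cleanly breaks out, and a failing mirror just moves on to the next link. Distilled into a standalone sketch (first_successful is hypothetical, same control flow):

def first_successful(links, download):
    # Try each mirror in order; stop at the first that succeeds.
    for i, link in enumerate(links):
        try:
            download(link, i)
            return link
        except Exception:
            continue
    return None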
@@ -116,8 +155,6 @@ def login():
     }, headers={
         "Content-Type": "application/x-www-form-urlencoded"
     })
-    with open("debug.login.html", "wb") as f:
-        f.write(r.content)
 
 
 topic_id = get_topic_id(TOPIC_URL)