mirror of https://github.com/simon987/mobilism_scrape.git
synced 2025-04-10 14:26:44 +00:00

commit 8fc858ab8f
initial
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
*.iml
*.json
.idea/
generate_tasks.py (new file, 94 lines)
@@ -0,0 +1,94 @@
import re
from queue import Queue
from threading import Thread
from urllib.parse import urljoin

import browser_cookie3
import requests
from bs4 import BeautifulSoup

from hexlib.concurrency import queue_iter
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker

# Reuse the logged-in Firefox session; the forum listings need an account.
cj = browser_cookie3.firefox()

session = requests.Session()
session.cookies = cj

SECTIONS = {
    # Non-fiction
    "biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
    "educational": "https://forum.mobilism.org/viewforum.php?f=122",
    "philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
    "food": "https://forum.mobilism.org/viewforum.php?f=1328",
    "health": "https://forum.mobilism.org/viewforum.php?f=545",
    "history": "https://forum.mobilism.org/viewforum.php?f=1346",
    "tech": "https://forum.mobilism.org/viewforum.php?f=892",
    "general": "https://forum.mobilism.org/viewforum.php?f=126",

    # Fiction
    "romance": "https://forum.mobilism.org/viewforum.php?f=1292",
    "erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
    "scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
    "mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
    "classics": "https://forum.mobilism.org/viewforum.php?f=121",
    "children": "https://forum.mobilism.org/viewforum.php?f=1295",

    "magazines": "https://forum.mobilism.org/viewforum.php?f=123",
    "comics": "https://forum.mobilism.org/viewforum.php?f=311",
    "collections": "https://forum.mobilism.org/viewforum.php?f=1271",
    "adult": "https://forum.mobilism.org/viewforum.php?f=125"
}

BASE = "https://forum.mobilism.org"


def get_topic_id(topic_url):
    # The topic id is the t= query parameter of a viewtopic.php URL.
    return re.search(r"[&?]t=([0-9]+)", topic_url).group(1)


def get_posts(link, start):
    # Forum listings are offset-paginated via the start= parameter.
    r = session.get("%s&start=%d" % (link, start))

    # Debug aid: keep a copy of the last fetched listing page.
    with open("test.html", "wb") as f:
        f.write(r.content)

    soup = BeautifulSoup(r.content, "html.parser")
    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
        yield urljoin(BASE, elem.get("href"))


api = TaskTrackerApi("http://localhost:8080/api")

# Reuse a saved worker identity, or register a new one.
worker = Worker.from_file(api)
if not worker:
    worker = api.make_worker("mobilism_insert")
    worker.dump_to_file()

worker.request_access(project=1, assign=False, submit=True)
input("Accept request")

q = Queue()


def submit_worker(q: Queue):
    for task in queue_iter(q):
        worker.submit_task(**task)


# Submit tasks from four daemon threads so slow API calls
# don't bottleneck the page crawl.
for _ in range(4):
    t = Thread(target=submit_worker, args=(q,))
    t.daemon = True
    t.start()

# Listing pages hold 40 topics each.
for page in range(0, 50000, 40):
    for topic_url in get_posts(SECTIONS["educational"], start=page):
        q.put(dict(
            project=1,
            recipe=topic_url,
            max_assign_time=60 * 10,
            unique_str=get_topic_id(topic_url),
        ))
    print(page)

q.join()
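A note on the consumer side: hexlib's queue_iter is not part of this commit. For the final q.join() to return, every item taken off the queue must be acknowledged with task_done(), so queue_iter presumably behaves roughly like the sketch below (queue_iter_sketch is a hypothetical name, not hexlib's actual implementation):

from queue import Queue


def queue_iter_sketch(q: Queue):
    # Block on the queue and yield each task, acknowledging it with
    # task_done() so q.join() can return once all submitted tasks have
    # been processed. The loop never exits on its own; that is fine
    # here because the consuming threads are daemonic.
    while True:
        task = q.get()
        try:
            yield task
        finally:
            q.task_done()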
run.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import gzip
import os
import pickle
import re
from base64 import b64decode
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup


def decode_cookiejar(b64_str):
    # PROJECT_SECRET holds a base64-encoded, pickled cookie jar.
    data = b64decode(b64_str)
    return pickle.loads(data)


# How the secret was generated:
# from hexlib.web import cookiejar_filter, encode_cookiejar, decode_cookiejar, save_cookiejar
# import browser_cookie3
#
# cj = cookiejar_filter(browser_cookie3.firefox(), "forum.mobilism.org|mblservices.org")
# with open("cookies.txt", "wb") as f:
#     f.write(encode_cookiejar(cj))
cj = decode_cookiejar(os.environ["PROJECT_SECRET"])

session = requests.Session()
session.cookies = cj

TOPIC_URL = "https://forum.mobilism.org/viewtopic.php?f=1346&t=3734829"

# File hosts the mblservices premium downloader can handle.
PREMIUM_LINKS = (
    "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
    "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
)


def is_supported_premium_dl(link):
    parsed = urlparse(link.lower())
    return parsed.netloc in PREMIUM_LINKS


def _download(link, i):
    # topic_id is set at module level before parse_topic() runs.
    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))

    r = session.get(link)

    with gzip.open(filename, "wb") as f:
        f.write(r.content)


def do_premium_download(link, i):
    # Step 1: submit the hoster link to the premium downloader, which
    # responds with a confirmation form.
    r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
        "link": link,
        "premium_acc": "on"
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })

    soup = BeautifulSoup(r.content, "html.parser")
    form = soup.find("form")

    # Step 2: echo the form's hidden fields back to obtain the direct
    # link, which is exposed in the <a> tag's download attribute.
    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
        "link": form.find("input", attrs={"name": "link"}).get("value"),
        "referer": form.find("input", attrs={"name": "referer"}).get("value"),
        "filename": form.find("input", attrs={"name": "filename"}).get("value"),
        "host": form.find("input", attrs={"name": "host"}).get("value"),
        "path": form.find("input", attrs={"name": "path"}).get("value"),
    })
    soup2 = BeautifulSoup(r2.content, "html.parser")
    download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
    _download(download_link, i)


def get_topic_id(topic_url):
    return re.search(r"[&?]t=([0-9]+)", topic_url).group(1)


def parse_topic(topic_url):
    r = session.get(topic_url)
    soup = BeautifulSoup(r.content, "html.parser")

    for i, elem in enumerate(soup.find_all(class_="postlink")):
        if not elem.get("href"):
            continue

        link = elem.get("href")
        if is_supported_premium_dl(link):
            do_premium_download(link, i)


topic_id = get_topic_id(TOPIC_URL)
parse_topic(TOPIC_URL)
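For completeness: run.py expects PROJECT_SECRET to contain a base64-encoded pickled cookie jar, i.e. the inverse of decode_cookiejar() above. The commented-out snippet relies on hexlib's cookiejar_filter and encode_cookiejar, which are not in this repo; assuming they amount to filtering by domain and then pickle + base64, a minimal standalone sketch (encode_cookiejar_sketch and the inline domain filter are mine, not hexlib's API):

import pickle
from base64 import b64encode

import browser_cookie3
from requests.cookies import RequestsCookieJar


def encode_cookiejar_sketch(cj):
    # Assumed inverse of decode_cookiejar() in run.py: pickle the jar,
    # then base64-encode it so it fits in an environment variable.
    return b64encode(pickle.dumps(cj))


if __name__ == "__main__":
    # Rough stand-in for hexlib's cookiejar_filter: keep only cookies
    # for the two domains run.py talks to. RequestsCookieJar is used
    # because it pickles cleanly (it drops its lock in __getstate__).
    cj = RequestsCookieJar()
    for cookie in browser_cookie3.firefox():
        if cookie.domain.endswith(("forum.mobilism.org", "mblservices.org")):
            cj.set_cookie(cookie)

    print(encode_cookiejar_sketch(cj).decode())

The printed string can then be exported as PROJECT_SECRET before invoking run.py.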