mirror of https://github.com/simon987/mobilism_scrape.git
synced 2025-11-04 03:56:52 +00:00

initial
commit 8fc858ab8f

.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
*.iml
*.json
.idea/

generate_tasks.py (new file, +94)
@@ -0,0 +1,94 @@
from queue import Queue
from threading import Thread
from urllib.parse import urljoin

import browser_cookie3

import requests
import re
from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
from hexlib.concurrency import queue_iter
from bs4 import BeautifulSoup

cj = browser_cookie3.firefox()

session = requests.Session()
session.cookies = cj

SECTIONS = {
    # Non-fiction
    "biographies": "https://forum.mobilism.org/viewforum.php?f=1285",
    "educational": "https://forum.mobilism.org/viewforum.php?f=122",
    "philosophy": "https://forum.mobilism.org/viewforum.php?f=1345",
    "food": "https://forum.mobilism.org/viewforum.php?f=1328",
    "health": "https://forum.mobilism.org/viewforum.php?f=545",
    "history": "https://forum.mobilism.org/viewforum.php?f=1346",
    "tech": "https://forum.mobilism.org/viewforum.php?f=892",
    "general": "https://forum.mobilism.org/viewforum.php?f=126",

    # Fiction
    "romance": "https://forum.mobilism.org/viewforum.php?f=1292",
    "erotic": "https://forum.mobilism.org/viewforum.php?f=1340",
    "scifi": "https://forum.mobilism.org/viewforum.php?f=1293",
    "mystery": "https://forum.mobilism.org/viewforum.php?f=1294",
    "classics": "https://forum.mobilism.org/viewforum.php?f=121",
    "children": "https://forum.mobilism.org/viewforum.php?f=1295",

    "magazines": "https://forum.mobilism.org/viewforum.php?f=123",
    "comics": "https://forum.mobilism.org/viewforum.php?f=311",
    "collections": "https://forum.mobilism.org/viewforum.php?f=1271",
    "adult": "https://forum.mobilism.org/viewforum.php?f=125"
}

BASE = "https://forum.mobilism.org"


def get_topic_id(topic_url):
    return re.search("[&?]t=([0-9]+)", topic_url).group(1)


def get_posts(link, start):
    r = session.get("%s&start=%d" % (link, start))

    with open("test.html", "wb") as f:
        f.write(r.content)

    soup = BeautifulSoup(r.content, "html.parser")
    for elem in soup.find_all("a", attrs={"class": "topictitle"}):
        yield urljoin(BASE, elem.get("href"))


api = TaskTrackerApi("http://localhost:8080/api")

worker = Worker.from_file(api)
if not worker:
    worker = api.make_worker("mobilism_insert")
    worker.dump_to_file()

worker.request_access(project=1, assign=False, submit=True)
input("Accept request")

q = Queue()


def submit_worker(q: Queue):
    for task in queue_iter(q):
        worker.submit_task(**task)


for _ in range(4):
    t = Thread(target=submit_worker, args=(q,))
    t.setDaemon(True)
    t.start()

for page in range(0, 50000, 40):
    for topic_url in get_posts(SECTIONS["educational"], start=page):
        q.put(dict(
            project=1,
            recipe=topic_url,
            max_assign_time=60 * 10,
            unique_str=get_topic_id(topic_url),
        ))
    print(page)

q.join()
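generate_tasks.py is the producer half: it pages through a section listing (phpBB's viewforum.php, stepping the start offset by 40 topics per page), extracts each topictitle link, and queues one task per topic for a local task_tracker instance, using the topic id as the task's unique key. Four daemon threads drain the queue; note that q.join() only returns if every item is matched by a task_done() call, which here has to happen inside hexlib.concurrency.queue_iter, and that Thread.setDaemon(True) is a deprecated alias for the daemon attribute. A minimal sketch of the same fan-out pattern using only the standard library, with a hypothetical submit_task() standing in for the TaskTracker call:

from queue import Queue
from threading import Thread


def submit_task(**task):
    # Hypothetical stand-in for worker.submit_task(**task) above.
    print("submitted", task)


q = Queue()


def consume(q: Queue):
    while True:
        task = q.get()
        try:
            submit_task(**task)
        finally:
            q.task_done()  # without this, q.join() below blocks forever


for _ in range(4):
    # daemon=True at construction is the non-deprecated spelling of t.setDaemon(True)
    Thread(target=consume, args=(q,), daemon=True).start()

for page in range(0, 120, 40):
    q.put(dict(recipe="https://forum.mobilism.org/viewforum.php?f=122&start=%d" % page))

q.join()  # returns once every queued item has been marked done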
							
								
								
									
run.py (new file, +93)
@@ -0,0 +1,93 @@
import gzip
import os
import pickle
import re
from base64 import b64decode
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup


def decode_cookiejar(b64_str):
    data = b64decode(b64_str)
    return pickle.loads(data)


# from hexlib.web import cookiejar_filter, encode_cookiejar, decode_cookiejar, save_cookiejar
# import browser_cookie3
#
# cj = cookiejar_filter(browser_cookie3.firefox(), "forum.mobilism.org|mblservices.org")
# with open("cookies.txt", "wb") as f:
#     f.write(encode_cookiejar(cj))
cj = decode_cookiejar(os.environ["PROJECT_SECRET"])

session = requests.Session()
session.cookies = cj

TOPIC_URL = "https://forum.mobilism.org/viewtopic.php?f=1346&t=3734829"

PREMIUM_LINKS = (
    "tusfiles.com", "userscloud.com", "uploaded.net", "ul.to", "uploaded.to", "2shared.com",
    "mediafire.com", "dailyuploads.net", "douploads.net", "centfile.com", "uploadev.org", "intoupload.net",
    "uploadrar.com", "mixloads.com", "ddownload.com", "filezip.cc", "sendit.cloud", "dropapk.to",
    "hulkload.com", "filerio.in", "rapidgator.net", "rg.to", "mega4up.com", "upload.ac", "dropgalaxy.in"
)


def is_supported_premium_dl(link):
    parsed = urlparse(link.lower())
    return parsed.netloc in PREMIUM_LINKS


def _download(link, i):
    filename = "%s%02d_%s.gz" % (topic_id, i, unquote(os.path.basename(link)).replace("/", "_"))

    r = session.get(link)

    with gzip.open(filename, "wb") as f:
        f.write(r.content)


def do_premium_download(link, i):
    r = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", data={
        "link": link,
        "premium_acc": "on"
    }, headers={
        "Content-Type": "application/x-www-form-urlencoded"
    })

    soup = BeautifulSoup(r.content, "html.parser")
    form = soup.find("form")

    r2 = session.post("https://mblservices.org/amember/downloader/downloader/app/index.php", {
        "link": form.find("input", attrs={"name": "link"}).get("value"),
        "referer": form.find("input", attrs={"name": "referer"}).get("value"),
        "filename": form.find("input", attrs={"name": "filename"}).get("value"),
        "host": form.find("input", attrs={"name": "host"}).get("value"),
        "path": form.find("input", attrs={"name": "path"}).get("value"),
    })
    soup2 = BeautifulSoup(r2.content, "html.parser")
    download_link = soup2.find("a", attrs={"download": lambda x: x}).get("download")
    _download(download_link, i)


def get_topic_id(topic_url):
    return re.search("[&?]t=([0-9]+)", topic_url).group(1)


def parse_topic(topic_url):
    r = session.get(topic_url)
    soup = BeautifulSoup(r.content, "html.parser")

    for i, elem in enumerate(soup.find_all(class_="postlink")):
        if not elem.get("href"):
            continue

        link = elem.get("href")
        if is_supported_premium_dl(link):
            do_premium_download(link, i)


topic_id = get_topic_id(TOPIC_URL)
parse_topic(TOPIC_URL)
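run.py is the consumer side of a task: given a topic URL, it scans the post for postlink anchors and, for the file hosts listed in PREMIUM_LINKS, relays each link through the mblservices premium downloader (a two-step form POST) and writes the response to disk gzipped, named by topic id (a module-level global) and link index. Authentication rides on a pickled cookie jar passed through the PROJECT_SECRET environment variable; the commented-out block shows it being captured from Firefox with hexlib. Since decode_cookiejar() is just base64 plus pickle.loads, producing the secret should be the mirror image, roughly the sketch below. Note that unpickling an untrusted value can execute arbitrary code, so the secret must only ever come from a trusted source.

import pickle
from base64 import b64encode


def encode_cookiejar(cj):
    # Presumed inverse of decode_cookiejar() above, assuming hexlib's
    # encode_cookiejar does the same: pickle the CookieJar, then
    # base64-encode it so the bytes fit in an environment variable.
    return b64encode(pickle.dumps(cj))

# e.g. os.environ["PROJECT_SECRET"] = encode_cookiejar(cj).decode()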