import json
import multiprocessing
import os

import requests
from bs4 import BeautifulSoup
from html2text import HTML2Text

# html2text converter used to turn description/attribution HTML into plain text.
h = HTML2Text()
h.images_to_alt = True

# Shared progress counters; handed to workers via the pool initializer so they
# also work with the "spawn" start method (Windows/macOS), not just "fork".
counter = multiprocessing.Value("i", 0)
total = multiprocessing.Value("i", 0)


def init_worker(shared_counter, shared_total):
    """Attach the shared progress counters inside each worker process."""
    global counter, total
    counter = shared_counter
    total = shared_total


def request_timeout(url):
    """GET a URL, retrying forever on network errors (prints '!' per failure)."""
    while True:
        try:
            return requests.get(url, timeout=15)
        except requests.RequestException:
            print("!", end="", flush=True)


def get_entries():
    """Crawl the OpenGameArt advanced-search pages and collect entry URLs."""
    for page in range(0, 134):
        print(f"Page {page}")
        r = request_timeout("https://opengameart.org/art-search-advanced?"
                            "field_art_tags_tid_op=or"
                            "&sort_by=count"
                            "&sort_order=DESC"
                            "&items_per_page=144"
                            f"&page={page}")
        print("Parsing...")
        soup = BeautifulSoup(r.text, "html.parser")
        for entry in soup.find_all("span", attrs={"class": "art-preview-title"}):
            link = entry.find("a")
            if link is not None:
                with open("entries.txt", "a") as f:
                    f.write(link.get("href") + "\n")


def download_entry(url):
    """Scrape one entry page: write metadata.json and download attached files."""
    with counter.get_lock():  # increment atomically across worker processes
        counter.value += 1
        done = counter.value
    print(f"{done}/{total.value} {done / total.value * 100:.2f}%")

    simple_title = os.path.split(url)[1]
    path = os.path.join("entries", simple_title)
    if os.path.exists(os.path.join(path, "metadata.json")):
        return  # already scraped on a previous run

    r = request_timeout("https://opengameart.org" + url)
    soup = BeautifulSoup(r.text, "html.parser")
    metadata = {}
    try:
        metadata["title"] = list(soup.find("div", attrs={"property": "dc:title"}).children)[0].text
        # find_all() returns a (possibly empty) list, never None.
        metadata["tags"] = [tag.text for tag in
                            soup.find_all("a", attrs={"property": "rdfs:label skos:prefLabel"})]
        metadata["description"] = h.handle(str(soup.find(
            "div", attrs={"class": "field-item even",
                          "property": "content:encoded"}))).strip()
        metadata["attribution"] = h.handle(str(soup.find(
            "div", attrs={"class": "field field-name-field-art-attribution "
                                   "field-type-text-long field-label-above"}))).strip()
        metadata["license"] = soup.find("div", attrs={"class": "license-name"}).text
        metadata["type"] = soup.find("a", attrs={"property": "rdfs:label skos:prefLabel",
                                                 "typeof": "skos:Concept"}).text

        os.makedirs(path, exist_ok=True)
        for file in soup.find_all("span", attrs={"class": "file"}):
            link = file.find("a").get("href")
            target = os.path.join(path, os.path.split(link)[1])
            if os.path.exists(target):
                continue
            while True:  # retry the download until it succeeds
                try:
                    response = requests.get(link, stream=True, timeout=8)
                    with open(target, "wb") as f:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:
                                f.write(chunk)
                    break
                except requests.RequestException:
                    print("!")

        # Write metadata last, so it doubles as a "this entry is complete" marker.
        with open(os.path.join(path, "metadata.json"), "w") as f:
            json.dump(metadata, f)
    except Exception as e:
        print(f"ERROR {url}: {e}")


def download_all():
    """Download every entry listed in entries.txt with a pool of 25 workers."""
    with open("entries.txt", "r") as f:
        lines = f.read().splitlines()
    total.value = len(lines)
    with multiprocessing.Pool(processes=25, initializer=init_worker,
                              initargs=(counter, total)) as pool:
        pool.map(download_entry, lines)


if __name__ == "__main__":
    if not os.path.exists("entries"):
        os.mkdir("entries")
    # get_entries()
    download_all()
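
# --- Illustrative sketch (not part of the original script) ---
# A minimal example of consuming the dataset that download_entry() produces:
# walk entries/<title>/metadata.json and yield each parsed metadata dict.
# The helper name iter_entries and its usage below are hypothetical; only the
# on-disk layout (entries/<title>/metadata.json) is taken from the code above.
#
# def iter_entries(root="entries"):
#     for name in sorted(os.listdir(root)):
#         meta_path = os.path.join(root, name, "metadata.json")
#         if os.path.exists(meta_path):
#             with open(meta_path, "r") as f:
#                 yield name, json.load(f)
#
# for name, meta in iter_entries():
#     print(name, meta["license"], meta["tags"])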