Added progression and resume option

simon987 2018-04-14 11:41:05 -04:00
parent 894c4f1328
commit 8212793dc5


@@ -8,6 +8,9 @@ import multiprocessing
 h = HTML2Text()
 h.images_to_alt = True
 
+counter = multiprocessing.Value("i", 0)
+total = multiprocessing.Value("i", 0)
+
 
 def request_timeout(url):
     while True:
@@ -41,61 +44,74 @@ def get_entries():
 def download_entry(url):
-    r = request_timeout("https://opengameart.org" + url)
-    soup = BeautifulSoup(r.text, "html.parser")
+    global counter
+    global total
+    counter.value += 1
+    print(str(counter.value) + "/" + str(total.value) + " {0:.2f}".format(counter.value / total.value * 100) + "%")
 
     simple_title = os.path.split(url)[1]
-    if not os.path.exists(simple_title + os.sep + "metadata.json"):
+    if not os.path.exists("entries" + os.sep + simple_title + os.sep + "metadata.json"):
+        r = request_timeout("https://opengameart.org" + url)
+        soup = BeautifulSoup(r.text, "html.parser")
 
         metadata = dict()
-        metadata["title"] = list(soup.find("div", attrs={"property": "dc:title"}).children)[0].text
-        metadata["tags"] = list()
-        for tag in soup.find_all("a", attrs={"property": "rdfs:label skos:prefLabel"}):
-            metadata["tags"].append(tag.text)
-        metadata["description"] = h.handle(str(soup.find("div", attrs={"class": "field-item even", "property": "content:encoded"}))).strip()
-        metadata["attribution"] = h.handle(str(soup.find("div", attrs={"class": "field field-name-field-art-attribution field-type-text-long field-label-above"}))).strip()
-        metadata["license"] = soup.find("div", attrs={"class": "license-name"}).text
-        metadata["type"] = soup.find("a", attrs={"property": "rdfs:label skos:prefLabel", "typeof": "skos:Concept"}).text
-
-        path = "entries" + os.sep + simple_title
-        if not os.path.exists(path):
-            os.mkdir(path)
-
-        for file in soup.find_all("span", attrs={"class", "file"}):
-            link = file.find("a").get("href")
-
-            if not os.path.exists(path + os.sep + os.path.split(link)[1]):
-                print(link)
-                while True:
-                    try:
-                        response = requests.get(link, stream=True, timeout=8)
-
-                        with open(path + os.sep + os.path.split(link)[1], 'wb') as f:
-                            for chunk in response.iter_content(chunk_size=1024):
-                                if chunk:
-                                    f.write(chunk)
-                        break
-                    except:
-                        print("!")
-
-        with open(path + os.sep + "metadata.json", "w") as f:
-            json.dump(metadata, f)
+        try:
+            metadata["title"] = list(soup.find("div", attrs={"property": "dc:title"}).children)[0].text
+            metadata["tags"] = list()
+            tag_list = soup.find_all("a", attrs={"property": "rdfs:label skos:prefLabel"})
+            if tag_list is not None:
+                for tag in tag_list:
+                    metadata["tags"].append(tag.text)
+            metadata["description"] = h.handle(str(soup.find("div", attrs={"class": "field-item even", "property": "content:encoded"}))).strip()
+            metadata["attribution"] = h.handle(str(soup.find("div", attrs={"class": "field field-name-field-art-attribution field-type-text-long field-label-above"}))).strip()
+            metadata["license"] = soup.find("div", attrs={"class": "license-name"}).text
+            metadata["type"] = soup.find("a", attrs={"property": "rdfs:label skos:prefLabel", "typeof": "skos:Concept"}).text
+
+            path = "entries" + os.sep + simple_title
+            if not os.path.exists(path):
+                os.mkdir(path)
+
+            for file in soup.find_all("span", attrs={"class", "file"}):
+                link = file.find("a").get("href")
+                if not os.path.exists(path + os.sep + os.path.split(link)[1]):
+
+                    while True:
+                        try:
+                            response = requests.get(link, stream=True, timeout=8)
+
+                            with open(path + os.sep + os.path.split(link)[1], 'wb') as f:
+                                for chunk in response.iter_content(chunk_size=1024):
+                                    if chunk:
+                                        f.write(chunk)
+                            break
+                        except:
+                            print("!")
+
+            with open(path + os.sep + "metadata.json", "w") as f:
+                json.dump(metadata, f)
+        except:
+            print("ERROR " + url)
 
 
 def download_all():
+    global total
     pool = multiprocessing.Pool(processes=25)
     with open("entries.txt", "r") as f:
-        pool.map(download_entry, f.read().splitlines())
+        lines = f.read().splitlines()
+        total.value = len(lines)
+        pool.map(download_entry, lines)
 
 
 if not os.path.exists("entries"):
     os.mkdir("entries")
-get_entries()
+# get_entries()
 download_all()
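
For reference, below is a minimal standalone sketch of the progress-counter pattern this commit introduces: a multiprocessing.Value shared with the pool workers and incremented as each entry is processed. The names, the Pool initializer, and the sample items are illustrative only, not part of the commit; the sketch also passes the shared values through initargs and guards the increment with get_lock(), whereas the commit increments counter.value directly and relies on the workers inheriting the module-level Value via fork.

# Sketch of a shared progress counter across pool workers (hypothetical names).
import multiprocessing

counter = multiprocessing.Value("i", 0)  # items processed so far
total = multiprocessing.Value("i", 0)    # items queued for this run


def init_worker(shared_counter, shared_total):
    # Re-bind the shared values inside each worker so the pattern also works
    # with the "spawn" start method (fork-based platforms inherit them anyway).
    global counter, total
    counter = shared_counter
    total = shared_total


def process_item(item):
    # += on a Value is not atomic by itself, so take its lock explicitly.
    with counter.get_lock():
        counter.value += 1
        done = counter.value
    print("{}/{} {:.2f}%".format(done, total.value, done / total.value * 100))


if __name__ == "__main__":
    items = ["a", "b", "c", "d"]
    total.value = len(items)
    with multiprocessing.Pool(processes=4, initializer=init_worker,
                              initargs=(counter, total)) as pool:
        pool.map(process_item, items)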