Added progress reporting and resume option

2025-10-15 12:06:53 +00:00 · 2018-04-14 11:41:05 -04:00 · 2018-04-14 11:41:05 -04:00 · 8212793dc5
commit 8212793dc5
parent 894c4f1328
1 changed file with 48 additions and 32 deletions
--- a/opengameart.org/run.py
+++ b/opengameart.org/run.py
@ -8,6 +8,9 @@ import multiprocessing
 h = HTML2Text()
 h.images_to_alt = True
 counter = multiprocessing.Value("i", 0)
 total = multiprocessing.Value("i", 0)
 def request_timeout(url):
    while True:
@ -41,18 +44,27 @@ def get_entries():
 def download_entry(url):
-    r = request_timeout("https://opengameart.org" + url)
+    global counter
-    soup = BeautifulSoup(r.text, "html.parser")
+    global total
    counter.value += 1
    print(str(counter.value) + "/" + str(total.value) + " {0:.2f}".format(counter.value / total.value * 100) + "%")
    simple_title = os.path.split(url)[1]
-    if not os.path.exists(simple_title + os.sep + "metadata.json"):
+    if not os.path.exists("entries" + os.sep + simple_title + os.sep + "metadata.json"):
        r = request_timeout("https://opengameart.org" + url)
        soup = BeautifulSoup(r.text, "html.parser")
        metadata = dict()
        try:
            metadata["title"] = list(soup.find("div", attrs={"property": "dc:title"}).children)[0].text
            metadata["tags"] = list()
-        for tag in soup.find_all("a", attrs={"property": "rdfs:label skos:prefLabel"}):
+            tag_list = soup.find_all("a", attrs={"property": "rdfs:label skos:prefLabel"})
            if tag_list is not None:
                for tag in tag_list:
                    metadata["tags"].append(tag.text)
            metadata["description"] = h.handle(str(soup.find("div", attrs={"class": "field-item even", "property": "content:encoded"}))).strip()
@ -68,7 +80,6 @@ def download_entry(url):
                link = file.find("a").get("href")
                if not os.path.exists(path + os.sep + os.path.split(link)[1]):
                print(link)
                    while True:
                        try:
@ -84,18 +95,23 @@ def download_entry(url):
            with open(path + os.sep + "metadata.json", "w") as f:
                json.dump(metadata, f)
        except:
            print("ERROR " + url)
 def download_all():
    global total
    pool = multiprocessing.Pool(processes=25)
    with open("entries.txt", "r") as f:
-        pool.map(download_entry, f.read().splitlines())
+        lines = f.read().splitlines()
        total.value = len(lines)
        pool.map(download_entry, lines)
 if not os.path.exists("entries"):
    os.mkdir("entries")
-get_entries()
+# get_entries()
 download_all()