import json
import os
import re
from datetime import datetime
from multiprocessing.pool import ThreadPool

from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm

# Destination directory for downloaded PDFs and their .s2meta sidecar files.
OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from the API's "next" pagination URL.
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")


class DocumentCloud:
    """Thin client for the public DocumentCloud search API."""

    def __init__(self):
        self.web = get_web()
        # Silence per-request logging from the hexlib web client.
        self.web._logger = None

    def list_documents(self, page=1):
        """Fetch one page of search results.

        Returns a ``(results, total_count, next_page)`` tuple;
        ``next_page`` is None when there are no further pages.
        """
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )

        j = r.json()

        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (KeyError, TypeError, AttributeError, ValueError):
            # "next" is missing/null, or its URL carries no &page=
            # parameter: this was the last page. (Previously a bare
            # `except:` hid any other failure as well.)
            next_page = None

        return j["results"], j["count"], next_page

    def all_documents(self):
        """Iterate over every document, wrapped in a tqdm progress bar."""
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        # Walk the paginated result set until the API reports no next
        # page. BUGFIX: the original looped `while True` and, once
        # next_page became None, kept requesting page=None forever.
        next_page = 0
        while next_page is not None:
            res, _, next_page = self.list_documents(next_page)
            yield from res


def result_to_s2meta(result):
    """Map one API search result dict to the .s2meta sidecar schema."""
    meta = {
        "title": result["title"],
        "language": result["language"],
        # "updated_at" is ISO-8601 with a trailing "Z"; strip it for
        # datetime.fromisoformat (which rejects "Z" before Python 3.11).
        "mtime": int(datetime.fromisoformat(result["updated_at"][:-1]).timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    if "source" in result:
        meta["source"] = result["source"]

    if "related_article" in result:
        meta["related_article"] = result["related_article"]

    return meta


def _item_file_path(item):
    # Single place that derives the local path for a document, so the
    # producer (tasks) and consumer (process_task) always agree.
    return os.path.join(OUTPUT_PATH, f"{item['id']} {item['slug']}.pdf")


def process_task(item):
    """Write the .s2meta sidecar and download the PDF for one item."""
    file_path = _item_file_path(item)

    s2meta = result_to_s2meta(item)
    with open(file_path + ".s2meta", "w") as f:
        f.write(json.dumps(s2meta))

    download_url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(download_url, file_path)


def tasks():
    """Yield every document that has not yet been downloaded."""
    documentcloud = DocumentCloud()

    item: dict
    for item in documentcloud.all_documents():
        if not os.path.exists(_item_file_path(item)):
            yield item


if __name__ == '__main__':
    # Context manager ensures the worker threads are closed and joined
    # on exit (the original never released the pool).
    with ThreadPool(processes=10) as pool:
        for _ in pool.imap(func=process_task, iterable=tasks()):
            pass

    print("done")