"""Mirror DocumentCloud search results to local PDFs with .s2meta sidecar files."""

import json
import os
import re
from datetime import datetime
from multiprocessing.pool import ThreadPool

from tqdm import tqdm

from hexlib.env import get_web
from hexlib.web import download_file

# Destination directory for downloaded PDFs and their .s2meta sidecars.
OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from a paginated "next" URL, e.g. "...&page=7" -> "7".
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")


class DocumentCloud:
    """Thin client over the public DocumentCloud search API."""

    def __init__(self):
        self.web = get_web()
        self.web._logger = None  # silence per-request logging

    def list_documents(self, page=1):
        """Fetch one page of search results.

        Returns a (results, total_count, next_page) tuple; next_page is
        None when the API reports no further pages.
        """
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )
        j = r.json()
        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (KeyError, TypeError, AttributeError):
            # "next" missing, null, or lacking a page parameter: last page.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            next_page = None
        return j["results"], j["count"], next_page

    def all_documents(self):
        """Yield every document, with a progress bar sized from the total count."""
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        # Follow the pagination chain until the API reports no next page.
        # Stopping on None fixes the original `while True`, which kept
        # requesting "page=None" once the last page was reached.
        next_page = 0
        while next_page is not None:
            res, _, next_page = self.list_documents(next_page)
            yield from res


def result_to_s2meta(result):
    """Convert one API search result dict into the .s2meta sidecar dict."""
    meta = {
        "title": result["title"],
        "language": result["language"],
        # "updated_at" ends with a trailing character (a literal "Z") that is
        # stripped before parsing. NOTE(review): the parsed datetime is naive,
        # so .timestamp() interprets it in the *local* timezone — presumably
        # the API reports UTC; confirm and attach tzinfo if so.
        "mtime": int(datetime.fromisoformat(result["updated_at"][:-1]).timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }
    # Optional fields: copied through only when present in the result.
    if "source" in result:
        meta["source"] = result["source"]
    if "related_article" in result:
        meta["related_article"] = result["related_article"]
    return meta


def process_task(item):
    """Write the .s2meta sidecar, then download the PDF for one document."""
    file_name = f"{item['id']} {item['slug']}.pdf"
    file_path = os.path.join(OUTPUT_PATH, file_name)

    s2meta = result_to_s2meta(item)
    with open(file_path + ".s2meta", "w") as f:
        f.write(json.dumps(s2meta))

    download_url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(download_url, file_path)


def tasks():
    """Yield documents whose PDF does not exist locally yet."""
    documentcloud = DocumentCloud()
    item: dict
    for item in documentcloud.all_documents():
        file_name = f"{item['id']} {item['slug']}.pdf"
        file_path = os.path.join(OUTPUT_PATH, file_name)
        if not os.path.exists(file_path):
            yield item


if __name__ == '__main__':
    # Context manager terminates the pool on exit (the original pool was
    # never closed or joined).
    with ThreadPool(processes=10) as pool:
        # imap drains the lazy tasks() generator with 10 concurrent downloads.
        for _ in pool.imap(func=process_task, iterable=tasks()):
            pass
    print("done")