mirror of https://github.com/simon987/documentcloud_dl.git
synced 2025-04-09 21:56:45 +00:00

first commit (f80bef1a7d)
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
*.iml
.idea/
requirements.txt (new file, 1 line)
@@ -0,0 +1 @@
git+git://github.com/simon987/hexlib.git
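Note: GitHub permanently disabled the unauthenticated git:// protocol in 2022, so this pin no longer installs as written; the HTTPS form of the same requirement would be:

git+https://github.com/simon987/hexlib.git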
run.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import json
import re
import os
from datetime import datetime
from multiprocessing.pool import ThreadPool

from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm

OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from the API's paginated "next" URL
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")

class DocumentCloud:

    def __init__(self):
        self.web = get_web()
        self.web._logger = None  # silence hexlib's per-request logging

    def list_documents(self, page=1):
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )

        j = r.json()

        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (KeyError, TypeError, AttributeError):
            # no "next" link on the last page
            next_page = None

        return j["results"], j["count"], next_page

    def all_documents(self):
        # Fetch one page up front only to learn the total count for the progress bar
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        next_page = 0
        # Stop once the API reports no further page
        while next_page is not None:
            res, _, next_page = self.list_documents(next_page)
            yield from res

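For context, list_documents() expects a paginated search payload with "results", "count", and a "next" URL. A hypothetical response, with field names inferred from the accesses in this file rather than taken from DocumentCloud's API documentation, and all values made up for illustration:

# Hypothetical shape of the payload list_documents() unpacks
sample = {
    "count": 2,
    "next": "https://api.www.documentcloud.org/api/documents/search/?q=&page=1",
    "results": [
        {
            "id": 20059100,
            "slug": "example-document",
            "title": "Example Document",
            "language": "eng",
            "updated_at": "2020-09-17T18:43:15.000000Z",
            "page_count": 12,
            "file_hash": "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33",
            "asset_url": "https://assets.documentcloud.org/",
            "source": "Example Newsroom",
        },
    ],
}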
def result_to_s2meta(result):
    # Build the sidecar metadata that accompanies each downloaded PDF
    meta = {
        "title": result["title"],
        "language": result["language"],
        # strip the trailing "Z" so datetime.fromisoformat() accepts the timestamp
        "mtime": int(datetime.fromisoformat(result["updated_at"][:-1]).timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    if "source" in result:
        meta["source"] = result["source"]

    if "related_article" in result:
        meta["related_article"] = result["related_article"]

    return meta


def process_task(item):
    file_name = f"{item['id']} {item['slug']}.pdf"
    file_path = os.path.join(OUTPUT_PATH, file_name)

    # Write the .s2meta sidecar first, then fetch the PDF itself
    s2meta = result_to_s2meta(item)
    with open(file_path + ".s2meta", "w") as f:
        f.write(json.dumps(s2meta))

    download_url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(download_url, file_path)

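With the hypothetical result above, process_task() would leave two files under OUTPUT_PATH, the PDF and its JSON sidecar:

/mnt/Hatchery/main/projects/documentcloud/data/20059100 example-document.pdf
/mnt/Hatchery/main/projects/documentcloud/data/20059100 example-document.pdf.s2meta

where the sidecar holds the title, language, mtime, pages, and checksum fields built by result_to_s2meta().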
def tasks():
    documentcloud = DocumentCloud()

    item: dict
    for item in documentcloud.all_documents():
        file_name = f"{item['id']} {item['slug']}.pdf"
        file_path = os.path.join(OUTPUT_PATH, file_name)

        # Skip documents that were already downloaded on a previous run
        if not os.path.exists(file_path):
            yield item

if __name__ == '__main__':
    pool = ThreadPool(processes=10)
    it = pool.imap(
        iterable=tasks(),
        func=process_task
    )

    # Drain the iterator so the pool actually processes every task
    for _ in it:
        pass

    print("done")
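Because imap() consumes tasks() lazily, downloads begin while the document listing is still being paged through, with at most ten downloads in flight at once. And since tasks() yields only documents whose PDF is not yet on disk, an interrupted run can simply be restarted and will resume where it left off.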
|
Loading…
x
Reference in New Issue
Block a user