mirror of https://github.com/simon987/documentcloud_dl.git
synced 2025-04-05 01:13:05 +00:00
106 lines · 2.4 KiB · Python
import json
import os
import re
from datetime import datetime, timezone
from multiprocessing.pool import ThreadPool

from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm
|
|
|
|
# Directory where downloaded PDFs and their .s2meta sidecar files are written.
OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from the API's "next" pagination URL,
# e.g. ".../search/?q=&page=7" -> "7".
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")
|
|
|
|
|
|
class DocumentCloud:
    """Minimal client for the documentcloud.org public search API."""

    def __init__(self):
        self.web = get_web()
        # Silence per-request logging from the hexlib web helper.
        self.web._logger = None

    def list_documents(self, page=1):
        """Fetch one page of search results.

        :param page: page number to request (the API accepts 0 as well).
        :return: tuple ``(results, count, next_page)`` where *next_page*
            is ``None`` when the API reports no further pages.
        """
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )

        j = r.json()

        # On the last page "next" is missing or None, and a URL without
        # "&page=N" makes the regex match fail — all of these mean "done".
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (AttributeError, KeyError, TypeError):
            next_page = None

        return j["results"], j["count"], next_page

    def all_documents(self):
        """Yield every document, wrapped in a tqdm progress bar."""
        # Initial request only to learn the total count for the bar.
        _, count, _ = self.list_documents(page=0)

        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        # Follow the API's pagination until no next page is reported.
        next_page = 0
        while True:
            res, _, next_page = self.list_documents(next_page)

            yield from res

            if not next_page:
                break
|
|
|
|
|
|
def result_to_s2meta(result):
    """Convert an API search result into an .s2meta metadata dict.

    :param result: one document dict from the documentcloud search API.
    :return: dict with title/language/mtime/pages/checksum, plus the
        optional "source" and "related_article" keys when present.

    The API's "updated_at" is an ISO-8601 UTC timestamp ending in "Z"
    (e.g. "2020-01-02T03:04:05Z"). The "Z" is stripped because older
    ``datetime.fromisoformat`` rejects it; the parsed value is then
    explicitly tagged UTC so ``.timestamp()`` does not reinterpret it
    in the machine's local timezone (the original code produced a
    wrong mtime on any non-UTC host).
    """
    meta = {
        "title": result["title"],
        "language": result["language"],
        "mtime": int(
            datetime.fromisoformat(result["updated_at"][:-1])
            .replace(tzinfo=timezone.utc)
            .timestamp()
        ),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    # Optional fields — copied through only when the API supplied them.
    if "source" in result:
        meta["source"] = result["source"]

    if "related_article" in result:
        meta["related_article"] = result["related_article"]

    return meta
|
|
|
|
|
|
def process_task(item):
    """Write the .s2meta sidecar for *item*, then download its PDF."""
    filename = f"{item['id']} {item['slug']}.pdf"
    destination = os.path.join(OUTPUT_PATH, filename)

    # Sidecar metadata goes next to the PDF, with an extra ".s2meta" suffix.
    with open(destination + ".s2meta", "w") as meta_file:
        json.dump(result_to_s2meta(item), meta_file)

    url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(url, destination)
|
|
|
|
|
|
def tasks():
    """Yield every document whose PDF is not on disk yet."""
    client = DocumentCloud()

    for doc in client.all_documents():
        pdf_name = f"{doc['id']} {doc['slug']}.pdf"

        # Skip documents already downloaded on a previous run.
        if os.path.exists(os.path.join(OUTPUT_PATH, pdf_name)):
            continue

        yield doc
|
|
|
|
|
|
if __name__ == '__main__':
    # Downloads are I/O-bound, so a thread pool is appropriate. The
    # context manager terminates and joins the pool on exit — the
    # original never closed it.
    with ThreadPool(processes=10) as pool:
        # imap consumes tasks() lazily, so downloads start before the
        # full document listing has been paged through.
        for _ in pool.imap(func=process_task, iterable=tasks()):
            pass

    print("done")
|