first commit

This commit is contained in:
simon987 2021-10-15 15:41:37 -04:00
commit f80bef1a7d
3 changed files with 105 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*.iml
.idea/

1
requirements.txt Normal file
View File

@@ -0,0 +1 @@
git+https://github.com/simon987/hexlib.git

102
run.py Normal file
View File

@@ -0,0 +1,102 @@
import json
import re
import os
from datetime import datetime
from multiprocessing.pool import ThreadPool
from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm
# Directory that receives each downloaded PDF and its ".s2meta" metadata file.
OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"
# Captures the numeric value of the "&page=" query parameter; it is applied to
# the "next" URL of a search response to find the following page number.
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")
class DocumentCloud:
    """Small client that pages through the DocumentCloud document search API."""

    def __init__(self):
        """Create the underlying web client and drop its logger."""
        self.web = get_web()
        # NOTE(review): presumably done to silence per-request logging during
        # the crawl -- confirm against hexlib's web client.
        self.web._logger = None

    def list_documents(self, page=1):
        """Fetch a single page of search results.

        Args:
            page: Page number placed in the ``page`` query parameter.

        Returns:
            A ``(results, count, next_page)`` tuple: the page's ``results``
            list, the ``count`` reported by the API, and the page number
            parsed from the response's ``next`` URL, or ``None`` when the
            response has no usable ``next`` URL.
        """
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )
        j = r.json()

        # Explicit checks instead of a bare ``except:`` so that unrelated
        # failures (and KeyboardInterrupt) are no longer silently swallowed.
        next_url = j.get("next")
        match = RE_PAGE_NUM.match(next_url) if next_url else None
        next_page = int(match.group(1)) if match else None

        return j["results"], j["count"], next_page

    def all_documents(self):
        """Yield every document, wrapped in a tqdm progress bar.

        One extra request is made up front only to learn the total count
        used as the progress bar's ``total``.
        """
        # NOTE(review): starts at page 0 while list_documents defaults to 1;
        # kept as in the original -- confirm how the API treats page=0.
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        """Yield documents page by page until there is no next page."""
        next_page = 0
        # Stop once the API stops advertising a next page; looping
        # unconditionally would keep requesting "page=None" forever.
        while next_page is not None:
            res, _, next_page = self.list_documents(next_page)
            yield from res
def result_to_s2meta(result):
    """Build the ``.s2meta`` metadata dict for one search result.

    Args:
        result: A document dict from the search API. Must contain ``title``,
            ``language``, ``updated_at`` (ISO-8601 string), ``page_count``
            and ``file_hash``; ``source`` and ``related_article`` are copied
            when present.

    Returns:
        A dict with keys ``title``, ``language``, ``mtime`` (integer Unix
        timestamp), ``pages`` and ``checksum``, plus the optional keys above.

    Raises:
        KeyError: If a required key is missing from ``result``.
        ValueError: If ``updated_at`` is not a parseable ISO-8601 timestamp.
    """
    from datetime import timezone

    updated_at = result["updated_at"]
    # datetime.fromisoformat() rejects the "Z" suffix before Python 3.11, so
    # spell it as an explicit UTC offset rather than discarding it.
    if updated_at.endswith("Z"):
        updated_at = updated_at[:-1] + "+00:00"
    updated = datetime.fromisoformat(updated_at)
    # A naive datetime would be interpreted in the host's local timezone by
    # .timestamp(), shifting mtime by the local UTC offset.
    if updated.tzinfo is None:
        updated = updated.replace(tzinfo=timezone.utc)

    meta = {
        "title": result["title"],
        "language": result["language"],
        "mtime": int(updated.timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    for optional_key in ("source", "related_article"):
        if optional_key in result:
            meta[optional_key] = result[optional_key]

    return meta
def process_task(item):
    """Persist one document: write its metadata file, then fetch its PDF.

    The metadata goes to ``<OUTPUT_PATH>/<id> <slug>.pdf.s2meta`` as JSON and
    the PDF is downloaded to the same path without the ``.s2meta`` suffix.

    Args:
        item: A document dict providing ``id``, ``slug``, ``asset_url`` and
            the fields read by ``result_to_s2meta``.
    """
    doc_id = item["id"]
    slug = item["slug"]
    pdf_path = os.path.join(OUTPUT_PATH, f"{doc_id} {slug}.pdf")

    metadata = result_to_s2meta(item)
    # The metadata file is written before the download starts.
    with open(pdf_path + ".s2meta", "w") as sidecar:
        serialized = json.dumps(metadata)
        sidecar.write(serialized)

    pdf_url = f"{item['asset_url']}documents/{doc_id}/{slug}.pdf"
    download_file(pdf_url, pdf_path)
def tasks():
    """Yield each document whose PDF does not yet exist under OUTPUT_PATH.

    Documents come from ``DocumentCloud().all_documents()``; one is skipped
    when ``<OUTPUT_PATH>/<id> <slug>.pdf`` already exists.
    """
    client = DocumentCloud()
    for doc in client.all_documents():
        pdf_path = os.path.join(OUTPUT_PATH, f"{doc['id']} {doc['slug']}.pdf")
        if os.path.exists(pdf_path):
            # Already on disk -- nothing to do for this document.
            continue
        yield doc
if __name__ == '__main__':
    # Run process_task over every pending document using 10 worker threads.
    # The context manager guarantees the pool is torn down on normal
    # completion and when a worker exception propagates out of the loop.
    with ThreadPool(processes=10) as pool:
        it = pool.imap(
            iterable=tasks(),
            func=process_task
        )
        # Results are discarded; iterating is what waits for each task.
        for _ in it:
            pass
    print("done")