mirror of https://github.com/simon987/documentcloud_dl.git
synced 2025-04-09 21:56:45 +00:00

first commit (f80bef1a7d)
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
*.iml
.idea/
requirements.txt (new file, 1 line)
@@ -0,0 +1 @@
git+git://github.com/simon987/hexlib.git
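Note: GitHub permanently disabled the unauthenticated git:// protocol in 2022, so this pin no longer installs as written; the HTTPS form of the same requirement would be:

git+https://github.com/simon987/hexlib.git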
run.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import json
import re
import os
from datetime import datetime
from multiprocessing.pool import ThreadPool

from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm

OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from the API's paginated "next" URL
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")

class DocumentCloud:

    def __init__(self):
        self.web = get_web()
        self.web._logger = None  # silence hexlib's per-request logging

    def list_documents(self, page=1):
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )

        j = r.json()

        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (KeyError, TypeError, AttributeError):
            # no "next" link on the last page
            next_page = None

        return j["results"], j["count"], next_page

    def all_documents(self):
        # Fetch one page up front only to learn the total count for the progress bar
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        next_page = 0
        # Stop once the API reports no further page
        while next_page is not None:
            res, _, next_page = self.list_documents(next_page)
            yield from res

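For context, list_documents() expects a paginated search payload with "results", "count", and a "next" URL. A hypothetical response, with field names inferred from the accesses in this file rather than taken from DocumentCloud's API documentation, and all values made up for illustration:

# Hypothetical shape of the payload list_documents() unpacks
sample = {
    "count": 2,
    "next": "https://api.www.documentcloud.org/api/documents/search/?q=&page=1",
    "results": [
        {
            "id": 20059100,
            "slug": "example-document",
            "title": "Example Document",
            "language": "eng",
            "updated_at": "2020-09-17T18:43:15.000000Z",
            "page_count": 12,
            "file_hash": "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33",
            "asset_url": "https://assets.documentcloud.org/",
            "source": "Example Newsroom",
        },
    ],
}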
def result_to_s2meta(result):
    # Build the sidecar metadata that accompanies each downloaded PDF
    meta = {
        "title": result["title"],
        "language": result["language"],
        # strip the trailing "Z" so datetime.fromisoformat() accepts the timestamp
        "mtime": int(datetime.fromisoformat(result["updated_at"][:-1]).timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    if "source" in result:
        meta["source"] = result["source"]

    if "related_article" in result:
        meta["related_article"] = result["related_article"]

    return meta


def process_task(item):
    file_name = f"{item['id']} {item['slug']}.pdf"
    file_path = os.path.join(OUTPUT_PATH, file_name)

    # Write the .s2meta sidecar first, then fetch the PDF itself
    s2meta = result_to_s2meta(item)
    with open(file_path + ".s2meta", "w") as f:
        f.write(json.dumps(s2meta))

    download_url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(download_url, file_path)

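With the hypothetical result above, process_task() would leave two files under OUTPUT_PATH, the PDF and its JSON sidecar:

/mnt/Hatchery/main/projects/documentcloud/data/20059100 example-document.pdf
/mnt/Hatchery/main/projects/documentcloud/data/20059100 example-document.pdf.s2meta

where the sidecar holds the title, language, mtime, pages, and checksum fields built by result_to_s2meta().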
def tasks():
    documentcloud = DocumentCloud()

    item: dict
    for item in documentcloud.all_documents():
        file_name = f"{item['id']} {item['slug']}.pdf"
        file_path = os.path.join(OUTPUT_PATH, file_name)

        # Skip documents that were already downloaded on a previous run
        if not os.path.exists(file_path):
            yield item

if __name__ == '__main__':
    pool = ThreadPool(processes=10)
    it = pool.imap(
        iterable=tasks(),
        func=process_task
    )

    # Drain the iterator so the pool actually processes every task
    for _ in it:
        pass

    print("done")
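Because imap() consumes tasks() lazily, downloads begin while the document listing is still being paged through, with at most ten downloads in flight at once. And since tasks() yields only documents whose PDF is not yet on disk, an interrupted run can simply be restarted and will resume where it left off.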
|
Loading…
x
Reference in New Issue
Block a user