mirror of https://github.com/simon987/documentcloud_dl.git
synced 2025-04-04 00:43:05 +00:00
first commit
commit f80bef1a7d
.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
*.iml
.idea/
requirements.txt (new file, 1 line)
@@ -0,0 +1 @@
git+https://github.com/simon987/hexlib.git
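
run.py below relies on three hexlib entry points; the call shapes here are inferred from that script rather than from hexlib's documentation, and the example URLs are placeholders. Note that run.py also imports tqdm, which is not pinned here (presumably it is pulled in transitively by hexlib).

from hexlib.env import get_web      # returns a configured, requests-like HTTP client
from hexlib.web import download_file

web = get_web()
r = web.get("https://example.com/")  # response object exposes .json()
download_file("https://example.com/doc.pdf", "/tmp/doc.pdf")  # stream a URL to a local path
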
run.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import json
import os
import re
from datetime import datetime
from multiprocessing.pool import ThreadPool

from hexlib.env import get_web
from hexlib.web import download_file
from tqdm import tqdm

OUTPUT_PATH = "/mnt/Hatchery/main/projects/documentcloud/data/"

# Extracts the page number from the API's "next" URL ("...&page=N")
RE_PAGE_NUM = re.compile(r".*&page=([0-9]+)")


class DocumentCloud:

    def __init__(self):
        self.web = get_web()
        self.web._logger = None  # silence hexlib's per-request logging

    def list_documents(self, page=1):
        r = self.web.get(
            f"https://api.www.documentcloud.org/api/documents/search/?q=&page={page}"
        )
        j = r.json()

        # "next" is a URL ending in "&page=N", or missing/None on the last page
        try:
            next_page = int(RE_PAGE_NUM.match(j["next"]).group(1))
        except (KeyError, TypeError, AttributeError):
            next_page = None

        return j["results"], j["count"], next_page

    def all_documents(self):
        # The first request is only made to get the total count for the progress bar
        _, count, _ = self.list_documents(page=0)
        yield from tqdm(self._all_documents(), total=count, unit="item")

    def _all_documents(self):
        next_page = 0
        while next_page is not None:  # stop once the API reports no next page
            res, _, next_page = self.list_documents(next_page)
            yield from res


def result_to_s2meta(result):
    meta = {
        "title": result["title"],
        "language": result["language"],
        # updated_at is ISO 8601 with a trailing "Z"; strip it for fromisoformat()
        "mtime": int(datetime.fromisoformat(result["updated_at"][:-1]).timestamp()),
        "pages": result["page_count"],
        "checksum": result["file_hash"],
    }

    if "source" in result:
        meta["source"] = result["source"]

    if "related_article" in result:
        meta["related_article"] = result["related_article"]

    return meta


def process_task(item):
    file_name = f"{item['id']} {item['slug']}.pdf"
    file_path = os.path.join(OUTPUT_PATH, file_name)

    # Write the JSON sidecar next to the (future) PDF
    s2meta = result_to_s2meta(item)
    with open(file_path + ".s2meta", "w") as f:
        f.write(json.dumps(s2meta))

    download_url = f"{item['asset_url']}documents/{item['id']}/{item['slug']}.pdf"
    download_file(download_url, file_path)


def tasks():
    documentcloud = DocumentCloud()

    item: dict
    for item in documentcloud.all_documents():
        file_name = f"{item['id']} {item['slug']}.pdf"
        file_path = os.path.join(OUTPUT_PATH, file_name)

        # Skip documents that have already been downloaded
        if not os.path.exists(file_path):
            yield item


if __name__ == '__main__':
    # Download up to 10 documents concurrently
    pool = ThreadPool(processes=10)
    it = pool.imap(process_task, tasks())

    # Drain the iterator so the pool actually processes every task
    for _ in it:
        pass

    print("done")
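
Each downloaded PDF gets a JSON sidecar with the same name plus an ".s2meta" suffix. A minimal sketch of reading one back (the document ID, slug, and path are hypothetical):

import json

with open("/mnt/Hatchery/main/projects/documentcloud/data/12345 example-doc.pdf.s2meta") as f:
    meta = json.load(f)

# Keys written by result_to_s2meta: title, language, mtime, pages, checksum,
# plus source and related_article when present.
print(meta["title"], meta["pages"])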