mirror of
https://github.com/simon987/feed_archiver.git
synced 2025-04-04 01:52:58 +00:00
add script to export to .ndjson
This commit is contained in:
parent
632f05c9ea
commit
b18a57d256
38
export_to_ndjson.py
Normal file
38
export_to_ndjson.py
Normal file
@ -0,0 +1,38 @@
|
||||
from hexlib.db import pg_fetch_cursor_all
|
||||
import psycopg2
|
||||
from tqdm import tqdm
|
||||
import orjson
|
||||
import zstandard as zstd
|
||||
|
||||
TABLE = "chan_8kun2_post"
|
||||
THREADS = 12
|
||||
|
||||
def main():
    """Dump every row of ``TABLE`` into a zstd-compressed NDJSON file.

    Each row is unpacked as ``(_id, archived_on, data)``. ``archived_on`` is
    converted to an integer timestamp and stored under the ``_archived_on``
    key of ``data``, which is then serialized with ``orjson`` as one line of
    ``out_mp.ndjson.zst``.

    The connection is always closed, even if the export fails part-way, so
    that the server-side cursor and its transaction are not left open.
    """
    # NOTE(review): host/port/user/password are blank in the source —
    # presumably placeholders to fill in before running; confirm.
    conn = psycopg2.connect(
        host="",
        port="",
        user="",
        password="",
        dbname="feed_archiver"
    )

    try:
        cur = conn.cursor()

        # Row count is only used as the progress-bar total.
        cur.execute("SELECT COUNT(*) FROM %s" % TABLE)
        row_count = cur.fetchone()[0]

        # Declare a named cursor and let the helper pull it in batches of
        # 5000 rows (presumably so the whole table is not fetched at once —
        # depends on pg_fetch_cursor_all, which is not visible here).
        cur.execute("DECLARE cur1 CURSOR FOR SELECT * FROM %s" % TABLE)
        rows = pg_fetch_cursor_all(cur, name="cur1", batch_size=5000)

        with open("out_mp.ndjson.zst", "wb") as f:
            cctx = zstd.ZstdCompressor(level=19, threads=THREADS)
            with cctx.stream_writer(f) as compressor:
                for row in tqdm(rows, total=row_count, unit="row"):
                    _id, archived_on, data = row
                    # NOTE(review): if archived_on is a naive datetime,
                    # .timestamp() interprets it in local time — confirm
                    # the column's timezone handling.
                    data["_archived_on"] = int(archived_on.timestamp())
                    # One write per record: JSON document plus newline.
                    compressor.write(orjson.dumps(data) + b"\n")
    finally:
        # Runs on both success and failure so the connection never leaks.
        conn.close()


if __name__ == '__main__':
    main()
|
@ -1,3 +1,4 @@
|
||||
elasticsearch
|
||||
psycopg2
|
||||
ujson
|
||||
ujson
|
||||
git+git://github.com/simon987/hexlib.git
|
Loading…
x
Reference in New Issue
Block a user