Switch to orjson, add ndjson_iter

This commit is contained in:
simon987 2020-08-04 21:20:20 -04:00
parent 30854c7f8b
commit 52ad2d22b9
4 changed files with 71 additions and 5 deletions

View File

@ -1,7 +1,7 @@
import base64
import sqlite3
import redis
import ujson as json
import orjson as json
class PersistentState:

View File

@ -1,6 +1,14 @@
import os
from io import BytesIO
from io import BytesIO, BufferedReader
from tarfile import TarFile, TarInfo
import subprocess
import gzip
import zstandard
try:
import orjson as json
except ImportError:
import json
def ftw(path):
@ -27,3 +35,62 @@ def add_buf_to_tar(tar: TarFile, filename: str, buf: BytesIO):
info = TarInfo(name=filename)
info.size = len(buf.getvalue())
tar.addfile(info, buf)
def _is_executable(fpath):
    """Tell whether *fpath* is an existing regular file that may be executed.

    Returns False for missing paths and for non-files (e.g. directories);
    otherwise returns the result of the ``os.X_OK`` access check.
    """
    if not os.path.isfile(fpath):
        return False
    return os.access(fpath, os.X_OK)
def find_program(*programs):
    """Return the path of the first of *programs* found on the search path.

    Candidates are tried in the order given; for each one, every directory
    of the ``PATH`` environment variable is probed in order.

    :param programs: executable file names to look for, most preferred first
    :return: the joined ``directory/program`` path of the first executable
        match, or ``None`` when nothing matches (or no names were given)
    """
    # An unset PATH used to raise KeyError; fall back to the platform default
    # search path instead. The split is hoisted since it does not depend on
    # the program being looked up.
    search_dirs = os.environ.get("PATH", os.defpath).split(os.pathsep)

    for program in programs:
        for directory in search_dirs:
            candidate = os.path.join(directory, program)
            if _is_executable(candidate):
                return candidate
    return None
def program_is_in_path(program) -> bool:
    """Tell whether *program* resolves to an executable file on ``PATH``.

    Delegates to ``find_program`` so both functions share a single
    PATH-scanning implementation and cannot drift apart.

    :param program: executable file name to look for
    :return: True when an executable match exists, False otherwise
    """
    return find_program(program) is not None
# Accepted values for the ``compression`` argument of ndjson_iter().
COMPRESSION_GZIP = "gz"
COMPRESSION_ZSTD = "zstd"


def _open_ndjson_lines(file, compression):
    """Open *file* for line iteration; return ``(line_iter, cleanup)``.

    ``line_iter`` yields raw lines as bytes and ``cleanup`` is a zero-argument
    callable that releases everything that was opened for it.
    """
    if compression == COMPRESSION_GZIP:
        prog = find_program("pigz", "gzip")
        if prog:
            process = subprocess.Popen([prog, "-dc", file], stdout=subprocess.PIPE)

            def cleanup():
                # Closing our end of the pipe lets the child exit even when
                # we stopped reading early; wait() then reaps it so no zombie
                # process is left behind.
                # NOTE(review): the exit status is not inspected, so a failed
                # decompression still looks like a short file to the caller.
                process.stdout.close()
                process.wait()

            return process.stdout, cleanup

        # This is much slower
        gz_reader = BufferedReader(gzip.open(file))
        # Closing the buffered wrapper also closes the gzip file it wraps.
        return gz_reader, gz_reader.close

    if compression == COMPRESSION_ZSTD:
        fp = open(file, "rb")
        try:
            zstd_reader = BufferedReader(zstandard.ZstdDecompressor().stream_reader(fp))
        except Exception:
            fp.close()
            raise

        def cleanup():
            # Close the decompressing reader first, then the file it reads.
            try:
                zstd_reader.close()
            finally:
                fp.close()

        return zstd_reader, cleanup

    # Binary mode keeps decoding independent of the locale's default encoding
    # and matches the bytes produced by the compressed branches; json.loads
    # accepts bytes.
    plain = open(file, "rb")
    return plain, plain.close


def ndjson_iter(*files, compression=""):
    """Lazily yield one decoded JSON document per line of each of *files*.

    :param files: paths of newline-delimited JSON files, read in order
    :param compression: ``COMPRESSION_GZIP``, ``COMPRESSION_ZSTD``, or any
        other value (default ``""``) for uncompressed input. Gzip input is
        decompressed by an external ``pigz``/``gzip`` process when one is
        found on PATH, otherwise by the slower ``gzip`` module.
    :return: generator of parsed JSON values

    The resources opened for a file are released when that file is exhausted,
    and also when the generator is closed early or parsing raises.
    """
    for file in files:
        line_iter, cleanup = _open_ndjson_lines(file, compression)
        try:
            for line in line_iter:
                yield json.loads(line)
        finally:
            cleanup()

View File

@ -2,7 +2,6 @@ import pickle
import re
from base64 import b64encode, b64decode
from http.cookiejar import Cookie
from io import BytesIO
from dateutil.parser import parse
from requests.cookies import RequestsCookieJar

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="hexlib",
version="1.13",
version="1.14",
description="Misc utility methods",
author="simon987",
author_email="me@simon987.net",
@ -12,6 +12,6 @@ setup(
"data/*"
]},
install_requires=[
"ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "ujson"
"ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard"
]
)