Switch to orjson, add ndjson_iter

This commit is contained in:
simon987 2020-08-04 21:20:20 -04:00
parent 30854c7f8b
commit 52ad2d22b9
4 changed files with 71 additions and 5 deletions

View File

@ -1,7 +1,7 @@
import base64 import base64
import sqlite3 import sqlite3
import redis import redis
import ujson as json import orjson as json
class PersistentState: class PersistentState:

View File

@ -1,6 +1,14 @@
import os import os
from io import BytesIO from io import BytesIO, BufferedReader
from tarfile import TarFile, TarInfo from tarfile import TarFile, TarInfo
import subprocess
import gzip
import zstandard
try:
import orjson as json
except ImportError:
import json
def ftw(path): def ftw(path):
@ -27,3 +35,62 @@ def add_buf_to_tar(tar: TarFile, filename: str, buf: BytesIO):
info = TarInfo(name=filename) info = TarInfo(name=filename)
info.size = len(buf.getvalue()) info.size = len(buf.getvalue())
tar.addfile(info, buf) tar.addfile(info, buf)
def _is_executable(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
def find_program(*programs):
    """Locate the first available executable among ``programs``.

    Candidates are tried in the order given; for each candidate the search
    directories are scanned in order and the first executable match wins.

    :param programs: program names to look for, in order of preference
    :return: path of the first executable found (search directory joined
        with the program name), or None when no candidate is found
    """
    # A missing PATH variable used to raise KeyError here; fall back to the
    # platform default search path so callers can still get a None result
    # and take their own fallback.
    search_dirs = os.environ.get("PATH", os.defpath).split(os.pathsep)

    for program in programs:
        for directory in search_dirs:
            exe_file = os.path.join(directory, program)
            if _is_executable(exe_file):
                return exe_file
    return None
def program_is_in_path(program) -> bool:
    """Report whether ``program`` can be found as an executable on PATH.

    :param program: program name to look for
    :return: True when some PATH directory contains an executable file
        with that name, False otherwise
    """
    # find_program performs the same directory scan and yields None on a miss.
    return find_program(program) is not None
# Values accepted by the ``compression`` argument of ndjson_iter().
COMPRESSION_GZIP = "gz"
COMPRESSION_ZSTD = "zstd"


def ndjson_iter(*files, compression=""):
    """Iterate over the JSON documents of newline-delimited JSON files.

    Files are read one after the other and every line is decoded with
    ``json.loads``.

    :param files: paths of the files to read, in order
    :param compression: COMPRESSION_GZIP, COMPRESSION_ZSTD, or any other
        value (default ``""``) for uncompressed input; applied to all files
    :return: generator yielding one decoded JSON value per input line

    Every handle opened for a file (file object, decompressor stream,
    decompression subprocess) is released when that file is exhausted, when
    decoding raises, or when the generator is closed before the end.
    The exit status of the external decompressor is not checked.
    """
    for file in files:
        # Release callbacks, registered as resources are acquired and run in
        # reverse order so wrappers are closed before what they wrap.
        cleanups = []
        try:
            if compression == COMPRESSION_GZIP:
                prog = find_program("pigz", "gzip")
                if prog:
                    process = subprocess.Popen(
                        [prog, "-dc", file], stdout=subprocess.PIPE
                    )
                    # Runs last: reap the child once its pipe is closed so it
                    # does not linger as a zombie.
                    cleanups.append(process.wait)
                    cleanups.append(process.stdout.close)
                    line_iter = process.stdout
                else:
                    # This is much slower
                    line_iter = BufferedReader(gzip.open(file))
                    cleanups.append(line_iter.close)
            elif compression == COMPRESSION_ZSTD:
                fp = open(file, "rb")
                cleanups.append(fp.close)
                dctx = zstandard.ZstdDecompressor()
                reader = dctx.stream_reader(fp)
                cleanups.append(reader.close)
                line_iter = BufferedReader(reader)
            else:
                # Read bytes like the compressed branches do, so decoding
                # does not depend on the locale's default text encoding.
                line_iter = open(file, "rb")
                cleanups.append(line_iter.close)

            for line in line_iter:
                yield json.loads(line)
        finally:
            for release in reversed(cleanups):
                release()

View File

@ -2,7 +2,6 @@ import pickle
import re import re
from base64 import b64encode, b64decode from base64 import b64encode, b64decode
from http.cookiejar import Cookie from http.cookiejar import Cookie
from io import BytesIO
from dateutil.parser import parse from dateutil.parser import parse
from requests.cookies import RequestsCookieJar from requests.cookies import RequestsCookieJar

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name="hexlib", name="hexlib",
version="1.13", version="1.14",
description="Misc utility methods", description="Misc utility methods",
author="simon987", author="simon987",
author_email="me@simon987.net", author_email="me@simon987.net",
@ -12,6 +12,6 @@ setup(
"data/*" "data/*"
]}, ]},
install_requires=[ install_requires=[
"ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "ujson" "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard"
] ]
) )