From 52ad2d22b930aa3e7ee727241af62c5ead2b4967 Mon Sep 17 00:00:00 2001
From: simon987
Date: Tue, 4 Aug 2020 21:20:20 -0400
Subject: [PATCH] Switch to orjson, add ndjson_iter

---
 hexlib/db.py    |  2 +-
 hexlib/files.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++-
 hexlib/web.py   |  1 -
 setup.py        |  4 +--
 4 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/hexlib/db.py b/hexlib/db.py
index 0fb2745..3052ba4 100644
--- a/hexlib/db.py
+++ b/hexlib/db.py
@@ -1,7 +1,7 @@
 import base64
 import sqlite3
 import redis
-import ujson as json
+import orjson as json
 
 
 class PersistentState:
diff --git a/hexlib/files.py b/hexlib/files.py
index bb58a1e..38b8ccf 100644
--- a/hexlib/files.py
+++ b/hexlib/files.py
@@ -1,6 +1,14 @@
 import os
-from io import BytesIO
+from io import BytesIO, BufferedReader
 from tarfile import TarFile, TarInfo
+import subprocess
+import gzip
+import zstandard
+
+try:
+    import orjson as json
+except ImportError:
+    import json
 
 
 def ftw(path):
@@ -27,3 +35,62 @@ def add_buf_to_tar(tar: TarFile, filename: str, buf: BytesIO):
     info = TarInfo(name=filename)
     info.size = len(buf.getvalue())
     tar.addfile(info, buf)
+
+
+def _is_executable(fpath):
+    return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+
+
+def find_program(*programs):
+    for program in programs:
+        for path in os.environ["PATH"].split(os.pathsep):
+            exe_file = os.path.join(path, program)
+            if _is_executable(exe_file):
+                return exe_file
+
+
+def program_is_in_path(program) -> bool:
+    for path in os.environ["PATH"].split(os.pathsep):
+        exe_file = os.path.join(path, program)
+        if _is_executable(exe_file):
+            return True
+
+    return False
+
+
+COMPRESSION_GZIP = "gz"
+COMPRESSION_ZSTD = "zstd"
+
+
+def ndjson_iter(*files, compression=""):
+    for file in files:
+        cleanup = None
+        if compression == COMPRESSION_GZIP:
+            prog = find_program("pigz", "gzip")
+            if prog:
+                process = subprocess.Popen([prog, "-dc", file], stdout=subprocess.PIPE)
+                line_iter = process.stdout
+            else:
+                # This is much slower
+                line_iter = BufferedReader(gzip.open(file))
+        elif compression == COMPRESSION_ZSTD:
+            fp = open(file, "rb")
+            dctx = zstandard.ZstdDecompressor()
+            reader = dctx.stream_reader(fp)
+            line_iter = BufferedReader(reader)
+
+            def cleanup():
+                fp.close()
+                reader.close()
+
+        else:
+            line_iter = open(file)
+
+            def cleanup():
+                line_iter.close()
+
+        for line in line_iter:
+            yield json.loads(line)
+        if cleanup:
+            cleanup()
+
diff --git a/hexlib/web.py b/hexlib/web.py
index 9cecb19..d996522 100644
--- a/hexlib/web.py
+++ b/hexlib/web.py
@@ -2,7 +2,6 @@ import pickle
 import re
 from base64 import b64encode, b64decode
 from http.cookiejar import Cookie
-from io import BytesIO
 
 from dateutil.parser import parse
 from requests.cookies import RequestsCookieJar
diff --git a/setup.py b/setup.py
index efe3a6d..8e2954a 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="hexlib",
-    version="1.13",
+    version="1.14",
     description="Misc utility methods",
     author="simon987",
     author_email="me@simon987.net",
@@ -12,6 +12,6 @@ setup(
         "data/*"
     ]},
     install_requires=[
-        "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "ujson"
+        "ImageHash", "influxdb", "siphash", "python-dateutil", "redis", "orjson", "zstandard"
     ]
 )