From eb49a94ebc4eb5f6ff1ea8d5730b4d3886141a24 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 12 Apr 2020 11:58:35 -0400 Subject: [PATCH] tweaks & support 8,32 & 128 length --- CMakeLists.txt | 10 ++- README.md | 7 +- hamming.c | 233 ++++++++++++++----------------------------------- install.sh | 21 ++--- 4 files changed, 84 insertions(+), 187 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 23c00bc..52d2866 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,11 @@ project(hamming C) set(CMAKE_C_STANDARD 99) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") - find_package(PostgreSQL REQUIRED) -add_library(hamming SHARED hamming.c) \ No newline at end of file +add_library(hamming SHARED hamming.c) +target_compile_options( + hamming + PRIVATE + -Ofast + -march=native +) \ No newline at end of file diff --git a/README.md b/README.md index 61bd913..25c7d3a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # PostgreSQL hamming distance -Hamming distance for fixed-length `bytea` datatype. +Hamming distance for fixed-length `bytea` datatype (8-byte, 32-byte and 128-byte). ### Compiling from source (CMake) @@ -10,7 +10,8 @@ cmake . make ``` -The functions in this library are very domain-specific and can realistically -only be used within the scope of [irarchives](https://github.com/simon987/irarchives). +The functions do not check argument lengths: every `bytea` argument must be exactly 8, 32 or 128 bytes (matching the function called). Shorter values are read out of bounds and may crash the server. 
See [hamming.c](hamming.c) for more information + + diff --git a/hamming.c b/hamming.c index 4cc476c..e72b136 100644 --- a/hamming.c +++ b/hamming.c @@ -1,15 +1,54 @@ #include "postgresql/server/postgres.h" #include "postgresql/server/fmgr.h" -PG_MODULE_MAGIC; +__always_inline +static int _distance_8(const char *h1, const char *h2) { + return __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); +} -PG_FUNCTION_INFO_V1(hash_is_within_distance18); +__always_inline +static int _distance_32(const char *h1, const char *h2) { + int distance = 0; + distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); + distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)); + distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2)); + distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3)); + return distance; +} + +__always_inline +static int _distance_128(const char *h1, const char *h2) { + int distance = 0; + distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); + distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)); + distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2)); + distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3)); + + distance += __builtin_popcountll(*((uint64 *) h1 + 4) ^ *((uint64 *) h2 + 4)); + distance += __builtin_popcountll(*((uint64 *) h1 + 5) ^ *((uint64 *) h2 + 5)); + distance += __builtin_popcountll(*((uint64 *) h1 + 6) ^ *((uint64 *) h2 + 6)); + distance += __builtin_popcountll(*((uint64 *) h1 + 7) ^ *((uint64 *) h2 + 7)); + + distance += __builtin_popcountll(*((uint64 *) h1 + 8) ^ *((uint64 *) h2 + 8)); + distance += __builtin_popcountll(*((uint64 *) h1 + 9) ^ *((uint64 *) h2 + 9)); + distance += __builtin_popcountll(*((uint64 *) h1 + 10) ^ *((uint64 *) h2 + 10)); + distance += __builtin_popcountll(*((uint64 *) h1 + 11) ^ *((uint64 *) h2 + 11)); + + distance += 
__builtin_popcountll(*((uint64 *) h1 + 12) ^ *((uint64 *) h2 + 12)); + distance += __builtin_popcountll(*((uint64 *) h1 + 13) ^ *((uint64 *) h2 + 13)); + distance += __builtin_popcountll(*((uint64 *) h1 + 14) ^ *((uint64 *) h2 + 14)); + distance += __builtin_popcountll(*((uint64 *) h1 + 15) ^ *((uint64 *) h2 + 15)); + return distance; +} + + +PG_MODULE_MAGIC; /** * Check if the hamming distance of the two raw byte arrays * is within the specified distance * - * It is assumed that: the two arrays are exactly 18 bytes long + * It is assumed that: the two arrays are exactly 8 bytes long * * Import with CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean @@ -18,33 +57,6 @@ PG_FUNCTION_INFO_V1(hash_is_within_distance18); * * @return the hamming distance between the two arrays */ -Datum hash_is_within_distance18(PG_FUNCTION_ARGS) { - - char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); - char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); - int32 max_distance = PG_GETARG_INT32(2); - - int distance = 0; - - distance += __builtin_popcountll( - *((uint64 *) h1) ^ *((uint64 *) h2) - ); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - distance += __builtin_popcountll( - *((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1) - ); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - distance += __builtin_popcount( - *((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8) - ); - - PG_RETURN_BOOL(distance <= max_distance); -} - PG_FUNCTION_INFO_V1(hash_is_within_distance8); Datum hash_is_within_distance8(PG_FUNCTION_ARGS) { @@ -52,12 +64,10 @@ Datum hash_is_within_distance8(PG_FUNCTION_ARGS) { char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); int32 max_distance = PG_GETARG_INT32(2); - if (__builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)) > max_distance) { - PG_RETURN_BOOL(false); - } - PG_RETURN_BOOL(true); + PG_RETURN_BOOL(_distance_8(h1, h2) <= max_distance); } +/** 32-byte **/ PG_FUNCTION_INFO_V1(hash_is_within_distance32); Datum 
hash_is_within_distance32(PG_FUNCTION_ARGS) { @@ -65,59 +75,24 @@ Datum hash_is_within_distance32(PG_FUNCTION_ARGS) { char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); int32 max_distance = PG_GETARG_INT32(2); - int distance = 0; - - distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2)); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3)); - if (distance > max_distance) { - PG_RETURN_BOOL(false); - } - - PG_RETURN_BOOL(distance <= max_distance); + PG_RETURN_BOOL(_distance_32(h1, h2) <= max_distance); } -PG_FUNCTION_INFO_V1(hash_distance8); +/** 128-byte **/ +PG_FUNCTION_INFO_V1(hash_is_within_distance128); -Datum hash_distance8(PG_FUNCTION_ARGS) { +Datum hash_is_within_distance128(PG_FUNCTION_ARGS) { char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); + int32 max_distance = PG_GETARG_INT32(2); - int distance = __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); - PG_RETURN_INT32(distance); + PG_RETURN_BOOL(_distance_128(h1, h2) <= max_distance); } -PG_FUNCTION_INFO_V1(hash_distance32); - -Datum hash_distance32(PG_FUNCTION_ARGS) { - char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); - char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); - - int distance = 0; - distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); - distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)); - distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2)); - distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3)); - - PG_RETURN_INT32(distance); -} - -PG_FUNCTION_INFO_V1(hash_distance18); - /** * Hamming distance of 
two raw byte arrays * - * It is assumed that: the two arrays are exactly 18 bytes long + * It is assumed that: the two arrays are exactly 8 bytes long * * Import with CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer @@ -126,106 +101,32 @@ PG_FUNCTION_INFO_V1(hash_distance18); * * @return the hamming distance between the two arrays */ -Datum hash_distance18(PG_FUNCTION_ARGS) { +PG_FUNCTION_INFO_V1(hash_distance8); + +Datum hash_distance8(PG_FUNCTION_ARGS) { char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); - int distance = 0; - - distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)); - distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)); - distance += __builtin_popcount(*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)); - - PG_RETURN_INT32(distance); + PG_RETURN_INT32(_distance_8(h1, h2)); } +/** 32-byte **/ +PG_FUNCTION_INFO_V1(hash_distance32); -PG_FUNCTION_INFO_V1(hash_is_within_distance18_any); +Datum hash_distance32(PG_FUNCTION_ARGS) { + char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); + char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); -/** - * Check if the first argument matches any (within distance 'max_distance') - hashes among an array of hashes - * - * It is assumed that: the first array is exactly 18 bytes long, the - second array length is a multiple of 18 bytes - * - * Import with - CREATE OR REPLACE FUNCTION hash_is_within_distance_any(bytea, bytea, integer) RETURNS bool - AS '/path/to/libhamming.so', 'hash_is_within_distance_any' - LANGUAGE C STRICT; - * - * @return true if at least 1 hash matches - */ -Datum hash_is_within_distance18_any(PG_FUNCTION_ARGS) { - - char *h = VARDATA(PG_GETARG_BYTEA_P(0)); - bytea *h_bytea = PG_GETARG_BYTEA_P(1); - char *h_arr = VARDATA(h_bytea); - int32 max_distance = PG_GETARG_INT32(2); - - int distance; - - for (int i = (VARSIZE(h_bytea) - VARHDRSZ) / 18 - 1; i >= 0; i--) { - - distance = __builtin_popcountll( - *((uint64 *) h) ^ *((uint64 
*) h_arr) - ); - if (distance > max_distance) { - h_arr += 18; - continue; - } - distance += __builtin_popcountll( - *((uint64 *) h + 1) ^ *((uint64 *) h_arr + 1) - ); - if (distance > max_distance) { - h_arr += 18; - continue; - } - distance += __builtin_popcount( - *((uint16 *) h + 8) ^ *((uint16 *) h_arr + 8) - ); - - if (distance <= max_distance) { - PG_RETURN_BOOL(true); - } - h_arr += 18; - } - - PG_RETURN_BOOL(false); + PG_RETURN_INT32(_distance_32(h1, h2)); } -PG_FUNCTION_INFO_V1(hash_equ18_any); +/** 128-byte **/ +PG_FUNCTION_INFO_V1(hash_distance128); -/** - * Check if the first argument exactly matches any hashes among an array of hashes - * - * It is assumed that: the first array is exactly 18 bytes long, the - second array length is a multiple of 18 bytes - * - * Import with - * CREATE OR REPLACE FUNCTION hash_equ_any(bytea, bytea) RETURNS bool - AS '/path/to/libhamming.so', 'hash_equ_any' - LANGUAGE C STRICT; - * @return true if at least 1 hash is equal - */ -Datum hash_equ18_any(PG_FUNCTION_ARGS) { +Datum hash_distance128(PG_FUNCTION_ARGS) { + char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); + char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); - char *h = VARDATA(PG_GETARG_BYTEA_P(0)); - bytea *h_bytea = PG_GETARG_BYTEA_P(1); - char *h_arr = VARDATA(h_bytea); - - for (int i = (VARSIZE(h_bytea) - VARHDRSZ) / 18 - 1; i >= 0; i--) { - - // This is a bit faster than __builtin_memcmp - if (*((uint64 *) h) == *((uint64 *) h_arr) && - *((uint64 *) h + 1) == *((uint64 *) h_arr + 1) && - *((uint16 *) h + 8) == *((uint16 *) h_arr + 8)) { - PG_RETURN_BOOL(true); - } - - h_arr += 18; - } - - PG_RETURN_BOOL(false); + PG_RETURN_INT32(_distance_128(h1, h2)); } diff --git a/install.sh b/install.sh index e5561c7..7da35f6 100755 --- a/install.sh +++ b/install.sh @@ -7,31 +7,22 @@ DATABASE=imhashdb LIB_PATH="'/usr/lib/libhamming.so'" psql -U $USER $DATABASE <