mirror of
https://github.com/simon987/pg_hamming.git
synced 2025-12-18 00:59:02 +00:00
tweaks & support 8,32 & 128 length
This commit is contained in:
233
hamming.c
233
hamming.c
@@ -1,15 +1,54 @@
|
||||
#include "postgresql/server/postgres.h"
#include "postgresql/server/fmgr.h"

#include <stdint.h>
#include <string.h>
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
/**
 * Hamming distance between two 8-byte hashes.
 *
 * Counts the bits that differ between the first 8 bytes at h1 and the
 * first 8 bytes at h2.
 *
 * The bytes are copied into local 64-bit words with memcpy instead of being
 * read through a cast pointer: a char pointer carries no 8-byte alignment
 * guarantee, and loading it as a 64-bit integer through a cast is undefined
 * behaviour (misaligned access / strict aliasing). Compilers turn the
 * fixed-size memcpy into a plain load where that is safe.
 *
 * @param h1 pointer to at least 8 readable bytes
 * @param h2 pointer to at least 8 readable bytes
 * @return number of differing bits, in the range 0..64
 */
static inline int _distance_8(const char *h1, const char *h2) {
    uint64_t a;
    uint64_t b;

    memcpy(&a, h1, sizeof a);
    memcpy(&b, h2, sizeof b);

    return __builtin_popcountll(a ^ b);
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_is_within_distance18);
|
||||
/**
 * Hamming distance between two 32-byte hashes.
 *
 * Walks both inputs in four 8-byte words, XORs each pair of words and sums
 * the population counts.
 *
 * Each word is copied out with memcpy instead of being read through a cast
 * pointer: a char pointer carries no 8-byte alignment guarantee, and loading
 * it as a 64-bit integer through a cast is undefined behaviour (misaligned
 * access / strict aliasing).
 *
 * @param h1 pointer to at least 32 readable bytes
 * @param h2 pointer to at least 32 readable bytes
 * @return number of differing bits, in the range 0..256
 */
static inline int _distance_32(const char *h1, const char *h2) {
    enum { HASH_BYTES = 32 };
    int distance = 0;

    for (size_t off = 0; off < HASH_BYTES; off += sizeof(uint64_t)) {
        uint64_t a;
        uint64_t b;

        memcpy(&a, h1 + off, sizeof a);
        memcpy(&b, h2 + off, sizeof b);
        distance += __builtin_popcountll(a ^ b);
    }

    return distance;
}
|
||||
|
||||
/**
 * Hamming distance between two 128-byte hashes.
 *
 * Walks both inputs in sixteen 8-byte words, XORs each pair of words and
 * sums the population counts.
 *
 * Each word is copied out with memcpy instead of being read through a cast
 * pointer: a char pointer carries no 8-byte alignment guarantee, and loading
 * it as a 64-bit integer through a cast is undefined behaviour (misaligned
 * access / strict aliasing). The fixed trip count lets the compiler unroll
 * the loop itself, so the sixteen hand-written copies are not needed.
 *
 * @param h1 pointer to at least 128 readable bytes
 * @param h2 pointer to at least 128 readable bytes
 * @return number of differing bits, in the range 0..1024
 */
static inline int _distance_128(const char *h1, const char *h2) {
    enum { HASH_BYTES = 128 };
    int distance = 0;

    for (size_t off = 0; off < HASH_BYTES; off += sizeof(uint64_t)) {
        uint64_t a;
        uint64_t b;

        memcpy(&a, h1 + off, sizeof a);
        memcpy(&b, h2 + off, sizeof b);
        distance += __builtin_popcountll(a ^ b);
    }

    return distance;
}
|
||||
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
/**
|
||||
* Check if the hamming distance of the two raw byte arrays
|
||||
* is within the specified distance
|
||||
*
|
||||
* It is assumed that: the two arrays are exactly 18 bytes long
|
||||
* It is assumed that: the two arrays are exactly 8 bytes long
|
||||
*
|
||||
* Import with
|
||||
CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean
|
||||
@@ -18,33 +57,6 @@ PG_FUNCTION_INFO_V1(hash_is_within_distance18);
|
||||
*
|
||||
* @return the hamming distance between the two arrays
|
||||
*/
|
||||
Datum hash_is_within_distance18(PG_FUNCTION_ARGS) {
|
||||
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
int32 max_distance = PG_GETARG_INT32(2);
|
||||
|
||||
int distance = 0;
|
||||
|
||||
distance += __builtin_popcountll(
|
||||
*((uint64 *) h1) ^ *((uint64 *) h2)
|
||||
);
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
distance += __builtin_popcountll(
|
||||
*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)
|
||||
);
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
distance += __builtin_popcount(
|
||||
*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)
|
||||
);
|
||||
|
||||
PG_RETURN_BOOL(distance <= max_distance);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_is_within_distance8);
|
||||
|
||||
Datum hash_is_within_distance8(PG_FUNCTION_ARGS) {
|
||||
@@ -52,12 +64,10 @@ Datum hash_is_within_distance8(PG_FUNCTION_ARGS) {
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
int32 max_distance = PG_GETARG_INT32(2);
|
||||
|
||||
if (__builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2)) > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
PG_RETURN_BOOL(true);
|
||||
PG_RETURN_BOOL(_distance_8(h1, h2) <= max_distance);
|
||||
}
|
||||
|
||||
/** 32-byte **/
|
||||
PG_FUNCTION_INFO_V1(hash_is_within_distance32);
|
||||
|
||||
Datum hash_is_within_distance32(PG_FUNCTION_ARGS) {
|
||||
@@ -65,59 +75,24 @@ Datum hash_is_within_distance32(PG_FUNCTION_ARGS) {
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
int32 max_distance = PG_GETARG_INT32(2);
|
||||
|
||||
int distance = 0;
|
||||
|
||||
distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1));
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2));
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3));
|
||||
if (distance > max_distance) {
|
||||
PG_RETURN_BOOL(false);
|
||||
}
|
||||
|
||||
PG_RETURN_BOOL(distance <= max_distance);
|
||||
PG_RETURN_BOOL(_distance_32(h1, h2) <= max_distance);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_distance8);
|
||||
/** 128-byte **/
|
||||
PG_FUNCTION_INFO_V1(hash_is_within_distance128);
|
||||
|
||||
Datum hash_distance8(PG_FUNCTION_ARGS) {
|
||||
Datum hash_is_within_distance128(PG_FUNCTION_ARGS) {
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
int32 max_distance = PG_GETARG_INT32(2);
|
||||
|
||||
int distance = __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
|
||||
PG_RETURN_INT32(distance);
|
||||
PG_RETURN_BOOL(_distance_128(h1, h2) <= max_distance);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_distance32);
|
||||
|
||||
Datum hash_distance32(PG_FUNCTION_ARGS) {
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
|
||||
int distance = 0;
|
||||
distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1));
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2));
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3));
|
||||
|
||||
PG_RETURN_INT32(distance);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_distance18);
|
||||
|
||||
/**
|
||||
* Hamming distance of two raw byte arrays
|
||||
*
|
||||
* It is assumed that: the two arrays are exactly 18 bytes long
|
||||
* It is assumed that: the two arrays are exactly 8 bytes long
|
||||
*
|
||||
* Import with
|
||||
CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer
|
||||
@@ -126,106 +101,32 @@ PG_FUNCTION_INFO_V1(hash_distance18);
|
||||
*
|
||||
* @return the hamming distance between the two arrays
|
||||
*/
|
||||
Datum hash_distance18(PG_FUNCTION_ARGS) {
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_distance8);
|
||||
|
||||
Datum hash_distance8(PG_FUNCTION_ARGS) {
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
|
||||
int distance = 0;
|
||||
|
||||
distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
|
||||
distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1));
|
||||
distance += __builtin_popcount(*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8));
|
||||
|
||||
PG_RETURN_INT32(distance);
|
||||
PG_RETURN_INT32(_distance_8(h1, h2));
|
||||
}
|
||||
|
||||
/** 32-byte **/
|
||||
PG_FUNCTION_INFO_V1(hash_distance32);
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_is_within_distance18_any);
|
||||
Datum hash_distance32(PG_FUNCTION_ARGS) {
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
|
||||
/**
|
||||
* Check if the first argument matches any (within distance 'max_distance')
|
||||
hashes among an array of hashes
|
||||
*
|
||||
* It is assumed that: the first array is exactly 18 bytes long, the
|
||||
second array length is a multiple of 18 bytes
|
||||
*
|
||||
* Import with
|
||||
CREATE OR REPLACE FUNCTION hash_is_within_distance_any(bytea, bytea, integer) RETURNS bool
|
||||
AS '/path/to/libhamming.so', 'hash_is_within_distance_any'
|
||||
LANGUAGE C STRICT;
|
||||
*
|
||||
* @return true if at least 1 hash matches
|
||||
*/
|
||||
Datum hash_is_within_distance18_any(PG_FUNCTION_ARGS) {
|
||||
|
||||
char *h = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
|
||||
char *h_arr = VARDATA(h_bytea);
|
||||
int32 max_distance = PG_GETARG_INT32(2);
|
||||
|
||||
int distance;
|
||||
|
||||
for (int i = (VARSIZE(h_bytea) - VARHDRSZ) / 18 - 1; i >= 0; i--) {
|
||||
|
||||
distance = __builtin_popcountll(
|
||||
*((uint64 *) h) ^ *((uint64 *) h_arr)
|
||||
);
|
||||
if (distance > max_distance) {
|
||||
h_arr += 18;
|
||||
continue;
|
||||
}
|
||||
distance += __builtin_popcountll(
|
||||
*((uint64 *) h + 1) ^ *((uint64 *) h_arr + 1)
|
||||
);
|
||||
if (distance > max_distance) {
|
||||
h_arr += 18;
|
||||
continue;
|
||||
}
|
||||
distance += __builtin_popcount(
|
||||
*((uint16 *) h + 8) ^ *((uint16 *) h_arr + 8)
|
||||
);
|
||||
|
||||
if (distance <= max_distance) {
|
||||
PG_RETURN_BOOL(true);
|
||||
}
|
||||
h_arr += 18;
|
||||
}
|
||||
|
||||
PG_RETURN_BOOL(false);
|
||||
PG_RETURN_INT32(_distance_32(h1, h2));
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(hash_equ18_any);
|
||||
/** 128-byte **/
|
||||
PG_FUNCTION_INFO_V1(hash_distance128);
|
||||
|
||||
/**
|
||||
* Check if the first argument exactly matches any hashes among an array of hashes
|
||||
*
|
||||
* It is assumed that: the first array is exactly 18 bytes long, the
|
||||
second array length is a multiple of 18 bytes
|
||||
*
|
||||
* Import with
|
||||
* CREATE OR REPLACE FUNCTION hash_equ_any(bytea, bytea) RETURNS bool
|
||||
AS '/path/to/libhamming.so', 'hash_equ_any'
|
||||
LANGUAGE C STRICT;
|
||||
* @return true if at least 1 hash is equal
|
||||
*/
|
||||
Datum hash_equ18_any(PG_FUNCTION_ARGS) {
|
||||
Datum hash_distance128(PG_FUNCTION_ARGS) {
|
||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||
|
||||
char *h = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
|
||||
char *h_arr = VARDATA(h_bytea);
|
||||
|
||||
for (int i = (VARSIZE(h_bytea) - VARHDRSZ) / 18 - 1; i >= 0; i--) {
|
||||
|
||||
// This is a bit faster than __builtin_memcmp
|
||||
if (*((uint64 *) h) == *((uint64 *) h_arr) &&
|
||||
*((uint64 *) h + 1) == *((uint64 *) h_arr + 1) &&
|
||||
*((uint16 *) h + 8) == *((uint16 *) h_arr + 8)) {
|
||||
PG_RETURN_BOOL(true);
|
||||
}
|
||||
|
||||
h_arr += 18;
|
||||
}
|
||||
|
||||
PG_RETURN_BOOL(false);
|
||||
PG_RETURN_INT32(_distance_128(h1, h2));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user