Compare commits

...

7 Commits
1.0 ... master

Author SHA1 Message Date
e32ba92aa7 docker 2020-05-23 14:45:23 -04:00
21403845aa update readme 2020-04-30 09:12:18 -04:00
5fc9c8548d fix include dirs 2020-04-28 11:12:39 -04:00
eb49a94ebc tweaks & support 8,32 & 128 length 2020-04-12 12:59:46 -04:00
ab3fb7191e Add 32-byte hash support 2020-04-11 21:08:51 -04:00
82291d600d Add 8-byte hash support 2020-04-10 18:55:20 -04:00
a7c41e4959 bugfix 2019-07-23 15:50:18 -04:00
5 changed files with 143 additions and 132 deletions

View File

@ -1,10 +1,16 @@
cmake_minimum_required(VERSION 3.14) cmake_minimum_required(VERSION 3.13)
include_directories("/usr/include/postgresql/server/")
project(hamming C) project(hamming C)
set(CMAKE_C_STANDARD 99) set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") find_package(PostgreSQL REQUIRED)
include_directories(${PostgreSQL_INCLUDE_DIRS})
find_package("PostgreSQL REQUIRED")
add_library(hamming SHARED hamming.c) add_library(hamming SHARED hamming.c)
# -march=native tunes the library for the machine that runs the build, which
# can produce illegal-instruction faults when the resulting .so is loaded on a
# different CPU (e.g. an image built on one host and run on another).
# It stays enabled by default; pass -DHAMMING_NATIVE_ARCH=OFF for a portable build.
option(HAMMING_NATIVE_ARCH "Tune the build for the host CPU (-march=native)" ON)

target_compile_options(
        hamming
        PRIVATE
        -Ofast
        $<$<BOOL:${HAMMING_NATIVE_ARCH}>:-march=native>
)

16
Dockerfile Normal file
View File

@ -0,0 +1,16 @@
# --- Build stage: compile libhamming.so against the PostgreSQL 12 server headers ---
FROM ubuntu:20.04 AS build
ARG DEBIAN_FRONTEND=noninteractive
# Update and install in a single layer so a cached, stale package index is
# never paired with a newer install step. The compiler and make are listed
# explicitly instead of relying on cmake's recommended packages.
RUN apt-get update \
    && apt-get install -y cmake gcc libc6-dev make postgresql-server-dev-12
WORKDIR /build/
COPY . /build/
# Configuring alone only generates the build files; the build step is what
# actually produces /build/libhamming.so for the next stage to copy.
RUN cmake . && cmake --build .

# --- Runtime stage: stock PostgreSQL 12 plus the compiled extension ---
FROM postgres:12
COPY --from=build /build/libhamming.so /usr/lib/libhamming.so
# Placed in the image's init directory (/docker-entrypoint-initdb.d) so the
# SQL functions get registered when the database is initialised.
COPY install.sh /docker-entrypoint-initdb.d/init-user-db.sh

View File

@ -1,16 +1,18 @@
# PostgreSQL hamming distance # PostgreSQL hamming distance
Hamming distance for fixed-length `bytea` datatype. Hamming distance for fixed-length `bytea` datatype (8-byte, 32-byte and 128-byte).
### Compiling from source (CMake) ### Compiling from source (CMake)
```bash ```bash
apt install postgresql-server-dev-11
cmake . cmake .
make make
``` ```
The functions in this library are very domain-specific and can realistically Will break catastrophically if function arguments are not exactly the correct length.
only be used within the scope of [irarchives](https://github.com/simon987/irarchives).
See [hamming.c](hamming.c) for more information See [hamming.c](hamming.c) for more information

211
hamming.c
View File

@ -1,169 +1,132 @@
#include "postgresql/server/postgres.h" #include <postgres.h>
#include "postgresql/server/fmgr.h" #include <fmgr.h>
__always_inline
static int _distance_8(const char *h1, const char *h2) {
return __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
}
__always_inline
static int _distance_32(const char *h1, const char *h2) {
int distance = 0;
distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1));
distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2));
distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3));
return distance;
}
__always_inline
static int _distance_128(const char *h1, const char *h2) {
int distance = 0;
distance += __builtin_popcountll(*((uint64 *) h1) ^ *((uint64 *) h2));
distance += __builtin_popcountll(*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1));
distance += __builtin_popcountll(*((uint64 *) h1 + 2) ^ *((uint64 *) h2 + 2));
distance += __builtin_popcountll(*((uint64 *) h1 + 3) ^ *((uint64 *) h2 + 3));
distance += __builtin_popcountll(*((uint64 *) h1 + 4) ^ *((uint64 *) h2 + 4));
distance += __builtin_popcountll(*((uint64 *) h1 + 5) ^ *((uint64 *) h2 + 5));
distance += __builtin_popcountll(*((uint64 *) h1 + 6) ^ *((uint64 *) h2 + 6));
distance += __builtin_popcountll(*((uint64 *) h1 + 7) ^ *((uint64 *) h2 + 7));
distance += __builtin_popcountll(*((uint64 *) h1 + 8) ^ *((uint64 *) h2 + 8));
distance += __builtin_popcountll(*((uint64 *) h1 + 9) ^ *((uint64 *) h2 + 9));
distance += __builtin_popcountll(*((uint64 *) h1 + 10) ^ *((uint64 *) h2 + 10));
distance += __builtin_popcountll(*((uint64 *) h1 + 11) ^ *((uint64 *) h2 + 11));
distance += __builtin_popcountll(*((uint64 *) h1 + 12) ^ *((uint64 *) h2 + 12));
distance += __builtin_popcountll(*((uint64 *) h1 + 13) ^ *((uint64 *) h2 + 13));
distance += __builtin_popcountll(*((uint64 *) h1 + 14) ^ *((uint64 *) h2 + 14));
distance += __builtin_popcountll(*((uint64 *) h1 + 15) ^ *((uint64 *) h2 + 15));
return distance;
}
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
PG_FUNCTION_INFO_V1(hash_is_within_distance);
/** /**
* Check if the hamming distance of the two raw byte arrays * Check if the hamming distance of the two raw byte arrays
* is within the specified distance * is within the specified distance
* *
* It is assumed that: the two arrays are exactly 18 bytes long * It is assumed that: the two arrays are exactly 8 bytes long
* *
* Import with * Import with
CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean
AS '/path/to/libhamming.so', 'hash_is_within_distance' AS '/path/to/libhamming.so', 'hash_is_within_distance'
LANGUAGE C STRICT;' LANGUAGE C STRICT;
* *
* @return the hamming distance between the two arrays * @return the hamming distance between the two arrays
*/ */
Datum hash_is_within_distance(PG_FUNCTION_ARGS) { PG_FUNCTION_INFO_V1(hash_is_within_distance8);
Datum hash_is_within_distance8(PG_FUNCTION_ARGS) {
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
int32 max_distance = PG_GETARG_INT32(2); int32 max_distance = PG_GETARG_INT32(2);
int distance = 0; PG_RETURN_BOOL(_distance_8(h1, h2) <= max_distance);
distance += __builtin_popcountll(
*((uint64 *) h1) ^ *((uint64 *) h2)
);
if (distance > max_distance) {
PG_RETURN_BOOL(false);
}
distance += __builtin_popcountll(
*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)
);
if (distance > max_distance) {
PG_RETURN_BOOL(false);
}
distance += __builtin_popcount(
*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)
);
PG_RETURN_BOOL(distance <= max_distance);
} }
PG_FUNCTION_INFO_V1(hash_distance); /** 32-byte **/
PG_FUNCTION_INFO_V1(hash_is_within_distance32);
Datum hash_is_within_distance32(PG_FUNCTION_ARGS) {
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
int32 max_distance = PG_GETARG_INT32(2);
PG_RETURN_BOOL(_distance_32(h1, h2) <= max_distance);
}
/** 128-byte **/
PG_FUNCTION_INFO_V1(hash_is_within_distance128);
Datum hash_is_within_distance128(PG_FUNCTION_ARGS) {
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
int32 max_distance = PG_GETARG_INT32(2);
PG_RETURN_BOOL(_distance_128(h1, h2) <= max_distance);
}
/** /**
* Hamming distance of two raw byte arrays * Hamming distance of two raw byte arrays
* *
* It is assumed that: the two arrays are exactly 18 bytes long * It is assumed that: the two arrays are exactly 8 bytes long
* *
* Import with * Import with
CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer
AS '/path/to/libhamming.so', 'hash_distance' AS '/path/to/libhamming.so', 'hash_distance'
LANGUAGE C STRICT;' LANGUAGE C STRICT;
* *
* @return the hamming distance between the two arrays * @return the hamming distance between the two arrays
*/ */
Datum hash_distance(PG_FUNCTION_ARGS) {
PG_FUNCTION_INFO_V1(hash_distance8);
Datum hash_distance8(PG_FUNCTION_ARGS) {
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0)); char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1)); char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
int distance = 0; PG_RETURN_INT32(_distance_8(h1, h2));
distance += __builtin_popcountll(
*((uint64 *) h1) ^ *((uint64 *) h2)
);
distance += __builtin_popcountll(
*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)
);
distance += __builtin_popcount(
*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)
);
PG_RETURN_INT32(distance);
} }
/** 32-byte **/
PG_FUNCTION_INFO_V1(hash_distance32);
PG_FUNCTION_INFO_V1(hash_is_within_distance_any); Datum hash_distance32(PG_FUNCTION_ARGS) {
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
/** PG_RETURN_INT32(_distance_32(h1, h2));
* Check if the first argument matches any (within distance 'max_distance')
hashes among an array of hashes
*
* It is assumed that: the first array is exactly 18 bytes long, the
second array length is a multiple of 18 bytes
*
* Import with
CREATE OR REPLACE FUNCTION hash_is_within_distance_any(bytea, bytea, integer) RETURNS bool
AS '/path/to/libhamming.so', 'hash_is_within_distance_any'
LANGUAGE C STRICT;'
*
* @return true if at least 1 hash matches
*/
Datum hash_is_within_distance_any(PG_FUNCTION_ARGS) {
char *h = VARDATA(PG_GETARG_BYTEA_P(0));
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
char *h_arr = VARDATA(h_bytea);
int32 max_distance = PG_GETARG_INT32(2);
int distance;
for (int i = VARSIZE(h_bytea) - 18; i >= 0; i -= 18) {
h_arr += 18;
distance = 0;
distance += __builtin_popcountll(
*((uint64 *) h) ^ *((uint64 *) h_arr)
);
if (distance > max_distance) {
continue;
}
distance += __builtin_popcountll(
*((uint64 *) h + 1) ^ *((uint64 *) h_arr + 1)
);
if (distance > max_distance) {
continue;
}
distance += __builtin_popcount(
*((uint16 *) h + 8) ^ *((uint16 *) h_arr + 8)
);
if (distance <= max_distance) {
PG_RETURN_BOOL(true);
}
}
PG_RETURN_BOOL(false);
} }
PG_FUNCTION_INFO_V1(hash_equ_any); /** 128-byte **/
PG_FUNCTION_INFO_V1(hash_distance128);
/** Datum hash_distance128(PG_FUNCTION_ARGS) {
* Check if the first argument exactly matches any hashes among an array of hashes char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
* char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
* It is assumed that: the first array is exactly 18 bytes long, the
second array length is a multiple of 18 bytes
*
* Import with
* CREATE OR REPLACE FUNCTION hash_equ_any(bytea, bytea) RETURNS bool
AS '/path/to/libhamming.so', 'hash_equ_any'
LANGUAGE C STRICT;
* @return true if at least 1 hash is equal
*/
Datum hash_equ_any(PG_FUNCTION_ARGS) {
char *h = VARDATA(PG_GETARG_BYTEA_P(0)); PG_RETURN_INT32(_distance_128(h1, h2));
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
char *h_arr = VARDATA(h_bytea);
for (int i = VARSIZE(h_bytea); i >= 0; i -= 18) {
// This is a bit faster than __builtin_memcmp
if (*((uint64 *) h) == *((uint64 *) h_arr) &&
*((uint64 *) h + 1) == *((uint64 *) h_arr + 1) &&
*((uint16 *) h + 8) == *((uint16 *) h_arr + 8)) {
PG_RETURN_BOOL(true);
}
h_arr += 18;
}
PG_RETURN_BOOL(false);
} }

24
install.sh Executable file
View File

@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Registers the libhamming.so C functions in the target database.
# Reads POSTGRES_USER and POSTGRES_DB from the environment.

# Abort on the first failing command so a broken install is not reported as success.
set -e

# Single quotes are part of the value: it is substituted into the SQL below
# as a string literal.
LIB_PATH="'/usr/lib/libhamming.so'"

# ON_ERROR_STOP makes psql exit non-zero if any CREATE FUNCTION fails instead
# of carrying on silently. Expansions are quoted so unusual user/database
# names are passed as a single argument; when POSTGRES_DB is empty the user
# name is used as the database name, which is what psql did by default before.
psql -v ON_ERROR_STOP=1 \
    --username "$POSTGRES_USER" \
    --dbname "${POSTGRES_DB:-$POSTGRES_USER}" <<EOF
CREATE OR REPLACE FUNCTION hash_is_within_distance8(bytea, bytea, integer) RETURNS boolean
AS $LIB_PATH, 'hash_is_within_distance8'
LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_is_within_distance32(bytea, bytea, integer) RETURNS boolean
AS $LIB_PATH, 'hash_is_within_distance32'
LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_is_within_distance128(bytea, bytea, integer) RETURNS boolean
AS $LIB_PATH, 'hash_is_within_distance128'
LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_distance8(bytea, bytea) RETURNS integer
AS $LIB_PATH, 'hash_distance8'
LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_distance32(bytea, bytea) RETURNS integer
AS $LIB_PATH, 'hash_distance32'
LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_distance128(bytea, bytea) RETURNS integer
AS $LIB_PATH, 'hash_distance128'
LANGUAGE C STRICT;
EOF