mirror of
https://github.com/simon987/pg_hamming.git
synced 2025-04-20 02:36:43 +00:00
Compare commits
7 Commits
Author | SHA1 | Date | |
---|---|---|---|
e32ba92aa7 | |||
21403845aa | |||
5fc9c8548d | |||
eb49a94ebc | |||
ab3fb7191e | |||
82291d600d | |||
a7c41e4959 |
@ -1,10 +1,16 @@
|
|||||||
cmake_minimum_required(VERSION 3.14)
|
cmake_minimum_required(VERSION 3.13)
|
||||||
include_directories("/usr/include/postgresql/server/")
|
|
||||||
project(hamming C)
|
project(hamming C)
|
||||||
|
|
||||||
set(CMAKE_C_STANDARD 99)
|
set(CMAKE_C_STANDARD 99)
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
|
find_package(PostgreSQL REQUIRED)
|
||||||
|
|
||||||
|
include_directories(${PostgreSQL_INCLUDE_DIRS})
|
||||||
|
|
||||||
find_package("PostgreSQL REQUIRED")
|
|
||||||
add_library(hamming SHARED hamming.c)
|
add_library(hamming SHARED hamming.c)
|
||||||
|
target_compile_options(
|
||||||
|
hamming
|
||||||
|
PRIVATE
|
||||||
|
-Ofast
|
||||||
|
-march=native
|
||||||
|
)
|
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Build stage: compile libhamming.so against the PostgreSQL 12 server headers.
FROM ubuntu:20.04 AS build

ARG DEBIAN_FRONTEND=noninteractive

# Update and install in one layer so a cached package index is never reused
# with a newer install command.
RUN apt update && apt install -y cmake postgresql-server-dev-12

WORKDIR /build/
COPY . /build/

# "cmake ." only generates the build files; "make" is what actually produces
# /build/libhamming.so, which the next stage copies.
RUN cmake . && make

# Runtime stage: stock PostgreSQL 12 image plus the compiled library.
FROM postgres:12

COPY --from=build /build/libhamming.so /usr/lib/libhamming.so
# Scripts in /docker-entrypoint-initdb.d/ are run when the database is first
# initialized; this one registers the C functions.
COPY install.sh /docker-entrypoint-initdb.d/init-user-db.sh
|
@ -1,16 +1,18 @@
|
|||||||
# PostgreSQL hamming distance
|
# PostgreSQL hamming distance
|
||||||
|
|
||||||
Hamming distance for fixed-length `bytea` datatype.
|
Hamming distance for fixed-length `bytea` datatype (8-byte, 32-byte and 128-byte).
|
||||||
|
|
||||||
|
|
||||||
### Compiling from source (CMake)
|
### Compiling from source (CMake)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
apt install postgresql-server-dev-11
|
||||||
cmake .
|
cmake .
|
||||||
make
|
make
|
||||||
```
|
```
|
||||||
|
|
||||||
The functions in this library are very domain-specific and can realistically
|
The functions will break catastrophically if their arguments are not exactly the expected length.
|
||||||
only be used within the scope of [irarchives](https://github.com/simon987/irarchives).
|
|
||||||
|
|
||||||
See [hamming.c](hamming.c) for more information
|
See [hamming.c](hamming.c) for more information
|
||||||
|
|
||||||
|
|
||||||
|
211
hamming.c
211
hamming.c
@ -1,169 +1,132 @@
|
|||||||
#include "postgresql/server/postgres.h"
|
#include <postgres.h>
|
||||||
#include "postgresql/server/fmgr.h"
|
#include <fmgr.h>
|
||||||
|
|
||||||
|
/**
 * Hamming distance between two 8-byte hashes.
 *
 * Each hash is copied into a local 64-bit word instead of being read through
 * a casted pointer, so the inputs do not have to be 8-byte aligned and no
 * strict-aliasing rule is broken. Optimizing compilers typically turn the
 * copy into a single load.
 *
 * @param h1 first hash, at least 8 readable bytes
 * @param h2 second hash, at least 8 readable bytes
 * @return number of differing bits (0..64)
 */
static inline __attribute__((always_inline)) int _distance_8(const char *h1, const char *h2) {
    uint64_t a;
    uint64_t b;

    __builtin_memcpy(&a, h1, sizeof a);
    __builtin_memcpy(&b, h2, sizeof b);

    return __builtin_popcountll(a ^ b);
}
|
||||||
|
|
||||||
|
/**
 * Hamming distance between two 32-byte hashes.
 *
 * The hashes are compared one 64-bit word at a time. Every word is copied
 * into a local variable instead of being read through a casted pointer, so
 * the inputs do not have to be 8-byte aligned and no strict-aliasing rule is
 * broken.
 *
 * @param h1 first hash, at least 32 readable bytes
 * @param h2 second hash, at least 32 readable bytes
 * @return number of differing bits (0..256)
 */
static inline __attribute__((always_inline)) int _distance_32(const char *h1, const char *h2) {
    int distance = 0;

    /* 32 bytes = 4 words of 8 bytes */
    for (int word = 0; word < 4; word++) {
        uint64_t a;
        uint64_t b;

        __builtin_memcpy(&a, h1 + word * 8, sizeof a);
        __builtin_memcpy(&b, h2 + word * 8, sizeof b);
        distance += __builtin_popcountll(a ^ b);
    }

    return distance;
}
|
||||||
|
|
||||||
|
/**
 * Hamming distance between two 128-byte hashes.
 *
 * The hashes are compared one 64-bit word at a time. Every word is copied
 * into a local variable instead of being read through a casted pointer, so
 * the inputs do not have to be 8-byte aligned and no strict-aliasing rule is
 * broken.
 *
 * @param h1 first hash, at least 128 readable bytes
 * @param h2 second hash, at least 128 readable bytes
 * @return number of differing bits (0..1024)
 */
static inline __attribute__((always_inline)) int _distance_128(const char *h1, const char *h2) {
    int distance = 0;

    /* 128 bytes = 16 words of 8 bytes */
    for (int word = 0; word < 16; word++) {
        uint64_t a;
        uint64_t b;

        __builtin_memcpy(&a, h1 + word * 8, sizeof a);
        __builtin_memcpy(&b, h2 + word * 8, sizeof b);
        distance += __builtin_popcountll(a ^ b);
    }

    return distance;
}
|
||||||
|
|
||||||
|
|
||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(hash_is_within_distance);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if the hamming distance of the two raw byte arrays
|
* Check if the hamming distance of the two raw byte arrays
|
||||||
* is within the specified distance
|
* is within the specified distance
|
||||||
*
|
*
|
||||||
* It is assumed that: the two arrays are exactly 18 bytes long
|
* It is assumed that: the two arrays are exactly 8 bytes long
|
||||||
*
|
*
|
||||||
* Import with
|
* Import with
|
||||||
CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean
|
CREATE OR REPLACE FUNCTION hash_is_within_distance8(bytea, bytea, integer) RETURNS boolean
|
||||||
AS '/path/to/libhamming.so', 'hash_is_within_distance'
|
AS '/path/to/libhamming.so', 'hash_is_within_distance8'
|
||||||
LANGUAGE C STRICT;'
|
LANGUAGE C STRICT;
|
||||||
*
|
*
|
||||||
* @return the hamming distance between the two arrays
|
* @return true if the hamming distance between the two arrays is at most the given maximum distance
|
||||||
*/
|
*/
|
||||||
Datum hash_is_within_distance(PG_FUNCTION_ARGS) {
|
PG_FUNCTION_INFO_V1(hash_is_within_distance8);
|
||||||
|
|
||||||
|
Datum hash_is_within_distance8(PG_FUNCTION_ARGS) {
|
||||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
int32 max_distance = PG_GETARG_INT32(2);
|
int32 max_distance = PG_GETARG_INT32(2);
|
||||||
|
|
||||||
int distance = 0;
|
PG_RETURN_BOOL(_distance_8(h1, h2) <= max_distance);
|
||||||
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h1) ^ *((uint64 *) h2)
|
|
||||||
);
|
|
||||||
if (distance > max_distance) {
|
|
||||||
PG_RETURN_BOOL(false);
|
|
||||||
}
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)
|
|
||||||
);
|
|
||||||
if (distance > max_distance) {
|
|
||||||
PG_RETURN_BOOL(false);
|
|
||||||
}
|
|
||||||
distance += __builtin_popcount(
|
|
||||||
*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)
|
|
||||||
);
|
|
||||||
|
|
||||||
PG_RETURN_BOOL(distance <= max_distance);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(hash_distance);
|
/** 32-byte **/
|
||||||
|
PG_FUNCTION_INFO_V1(hash_is_within_distance32);
|
||||||
|
|
||||||
|
Datum hash_is_within_distance32(PG_FUNCTION_ARGS) {
|
||||||
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
|
int32 max_distance = PG_GETARG_INT32(2);
|
||||||
|
|
||||||
|
PG_RETURN_BOOL(_distance_32(h1, h2) <= max_distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** 128-byte **/
|
||||||
|
PG_FUNCTION_INFO_V1(hash_is_within_distance128);
|
||||||
|
|
||||||
|
Datum hash_is_within_distance128(PG_FUNCTION_ARGS) {
|
||||||
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
|
int32 max_distance = PG_GETARG_INT32(2);
|
||||||
|
|
||||||
|
PG_RETURN_BOOL(_distance_128(h1, h2) <= max_distance);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hamming distance of two raw byte arrays
|
* Hamming distance of two raw byte arrays
|
||||||
*
|
*
|
||||||
* It is assumed that: the two arrays are exactly 18 bytes long
|
* It is assumed that: the two arrays are exactly 8 bytes long
|
||||||
*
|
*
|
||||||
* Import with
|
* Import with
|
||||||
CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer
|
CREATE OR REPLACE FUNCTION hash_distance8(bytea, bytea) RETURNS integer
|
||||||
AS '/path/to/libhamming.so', 'hash_distance'
|
AS '/path/to/libhamming.so', 'hash_distance8'
|
||||||
LANGUAGE C STRICT;'
|
LANGUAGE C STRICT;
|
||||||
*
|
*
|
||||||
* @return the hamming distance between the two arrays
|
* @return the hamming distance between the two arrays
|
||||||
*/
|
*/
|
||||||
Datum hash_distance(PG_FUNCTION_ARGS) {
|
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(hash_distance8);
|
||||||
|
|
||||||
|
Datum hash_distance8(PG_FUNCTION_ARGS) {
|
||||||
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
|
|
||||||
int distance = 0;
|
PG_RETURN_INT32(_distance_8(h1, h2));
|
||||||
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h1) ^ *((uint64 *) h2)
|
|
||||||
);
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h1 + 1) ^ *((uint64 *) h2 + 1)
|
|
||||||
);
|
|
||||||
distance += __builtin_popcount(
|
|
||||||
*((uint16 *) h1 + 8) ^ *((uint16 *) h2 + 8)
|
|
||||||
);
|
|
||||||
|
|
||||||
PG_RETURN_INT32(distance);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** 32-byte **/
|
||||||
|
PG_FUNCTION_INFO_V1(hash_distance32);
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(hash_is_within_distance_any);
|
Datum hash_distance32(PG_FUNCTION_ARGS) {
|
||||||
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
|
|
||||||
/**
|
PG_RETURN_INT32(_distance_32(h1, h2));
|
||||||
* Check if the first argument matches any (within distance 'max_distance')
|
|
||||||
hashes among an array of hashes
|
|
||||||
*
|
|
||||||
* It is assumed that: the first array is exactly 18 bytes long, the
|
|
||||||
second array length is a multiple of 18 bytes
|
|
||||||
*
|
|
||||||
* Import with
|
|
||||||
CREATE OR REPLACE FUNCTION hash_is_within_distance_any(bytea, bytea, integer) RETURNS bool
|
|
||||||
AS '/path/to/libhamming.so', 'hash_is_within_distance_any'
|
|
||||||
LANGUAGE C STRICT;'
|
|
||||||
*
|
|
||||||
* @return true if at least 1 hash matches
|
|
||||||
*/
|
|
||||||
Datum hash_is_within_distance_any(PG_FUNCTION_ARGS) {
|
|
||||||
|
|
||||||
char *h = VARDATA(PG_GETARG_BYTEA_P(0));
|
|
||||||
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
|
|
||||||
char *h_arr = VARDATA(h_bytea);
|
|
||||||
int32 max_distance = PG_GETARG_INT32(2);
|
|
||||||
|
|
||||||
int distance;
|
|
||||||
|
|
||||||
for (int i = VARSIZE(h_bytea) - 18; i >= 0; i -= 18) {
|
|
||||||
h_arr += 18;
|
|
||||||
distance = 0;
|
|
||||||
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h) ^ *((uint64 *) h_arr)
|
|
||||||
);
|
|
||||||
if (distance > max_distance) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
distance += __builtin_popcountll(
|
|
||||||
*((uint64 *) h + 1) ^ *((uint64 *) h_arr + 1)
|
|
||||||
);
|
|
||||||
if (distance > max_distance) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
distance += __builtin_popcount(
|
|
||||||
*((uint16 *) h + 8) ^ *((uint16 *) h_arr + 8)
|
|
||||||
);
|
|
||||||
|
|
||||||
if (distance <= max_distance) {
|
|
||||||
PG_RETURN_BOOL(true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PG_RETURN_BOOL(false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(hash_equ_any);
|
/** 128-byte **/
|
||||||
|
PG_FUNCTION_INFO_V1(hash_distance128);
|
||||||
|
|
||||||
/**
|
Datum hash_distance128(PG_FUNCTION_ARGS) {
|
||||||
* Check if the first argument exactly matches any hashes among an array of hashes
|
char *h1 = VARDATA(PG_GETARG_BYTEA_P(0));
|
||||||
*
|
char *h2 = VARDATA(PG_GETARG_BYTEA_P(1));
|
||||||
* It is assumed that: the first array is exactly 18 bytes long, the
|
|
||||||
second array length is a multiple of 18 bytes
|
|
||||||
*
|
|
||||||
* Import with
|
|
||||||
* CREATE OR REPLACE FUNCTION hash_equ_any(bytea, bytea) RETURNS bool
|
|
||||||
AS '/path/to/libhamming.so', 'hash_equ_any'
|
|
||||||
LANGUAGE C STRICT;
|
|
||||||
* @return true if at least 1 hash is equal
|
|
||||||
*/
|
|
||||||
Datum hash_equ_any(PG_FUNCTION_ARGS) {
|
|
||||||
|
|
||||||
char *h = VARDATA(PG_GETARG_BYTEA_P(0));
|
PG_RETURN_INT32(_distance_128(h1, h2));
|
||||||
bytea *h_bytea = PG_GETARG_BYTEA_P(1);
|
|
||||||
char *h_arr = VARDATA(h_bytea);
|
|
||||||
|
|
||||||
for (int i = VARSIZE(h_bytea); i >= 0; i -= 18) {
|
|
||||||
|
|
||||||
// This is a bit faster than __builtin_memcmp
|
|
||||||
if (*((uint64 *) h) == *((uint64 *) h_arr) &&
|
|
||||||
*((uint64 *) h + 1) == *((uint64 *) h_arr + 1) &&
|
|
||||||
*((uint16 *) h + 8) == *((uint16 *) h_arr + 8)) {
|
|
||||||
PG_RETURN_BOOL(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
h_arr += 18;
|
|
||||||
}
|
|
||||||
|
|
||||||
PG_RETURN_BOOL(false);
|
|
||||||
}
|
}
|
||||||
|
24
install.sh
Executable file
24
install.sh
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Registers the C functions exported by libhamming.so in the database named
# by $POSTGRES_DB, connecting as $POSTGRES_USER.

# Abort on the first failing command.
set -e

# The value keeps its own single quotes because it is pasted into the SQL
# below as a string literal.
LIB_PATH="'/usr/lib/libhamming.so'"

# ON_ERROR_STOP makes psql exit with a non-zero status when a statement fails,
# instead of carrying on and reporting success.
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" "$POSTGRES_DB" <<EOF
CREATE OR REPLACE FUNCTION hash_is_within_distance8(bytea, bytea, integer) RETURNS boolean
    AS $LIB_PATH, 'hash_is_within_distance8'
    LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_is_within_distance32(bytea, bytea, integer) RETURNS boolean
    AS $LIB_PATH, 'hash_is_within_distance32'
    LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_is_within_distance128(bytea, bytea, integer) RETURNS boolean
    AS $LIB_PATH, 'hash_is_within_distance128'
    LANGUAGE C STRICT;

CREATE OR REPLACE FUNCTION hash_distance8(bytea, bytea) RETURNS integer
    AS $LIB_PATH, 'hash_distance8'
    LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_distance32(bytea, bytea) RETURNS integer
    AS $LIB_PATH, 'hash_distance32'
    LANGUAGE C STRICT;
CREATE OR REPLACE FUNCTION hash_distance128(bytea, bytea) RETURNS integer
    AS $LIB_PATH, 'hash_distance128'
    LANGUAGE C STRICT;
EOF
|
Loading…
x
Reference in New Issue
Block a user