From 9ef184fd7c00ce9524537cc551bbbe02e86a2f49 Mon Sep 17 00:00:00 2001
From: simon
Date: Sat, 20 Jul 2019 09:52:31 -0400
Subject: [PATCH] Initial commit

---
 .gitignore     |  8 ++++++
 CMakeLists.txt | 10 +++++++
 README.md      | 13 +++++++++
 hamming.c      | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 109 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 README.md
 create mode 100644 hamming.c

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f5b17b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+cmake-build-debug/
+CMakeFiles
+cmake_install.cmake
+Makefile
+CMakeCache.txt
+*.so
+*.cbp
+.idea/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f98a47c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+project(hamming C)
+
+set(CMAKE_C_STANDARD 99)
+# This is a C-only project: -march=native must go into the C flags (CXX flags are never used).
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
+
+find_package(PostgreSQL REQUIRED)  # unquoted, so REQUIRED is a keyword and not part of the name
+include_directories(${PostgreSQL_INCLUDE_DIRS} "/usr/include/postgresql/server/")
+add_library(hamming SHARED hamming.c)
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4f21340
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# PostgreSQL hamming distance
+
+Hamming distance for fixed-length `bytea` datatype.
+
+
+### Compiling from source (CMake)
+
+```bash
+cmake .
+make
+```
+
+See [hamming.c](hamming.c) for more information
\ No newline at end of file
diff --git a/hamming.c b/hamming.c
new file mode 100644
index 0000000..2b8c440
--- /dev/null
+++ b/hamming.c
@@ -0,0 +1,78 @@
+#include "postgresql/server/postgres.h"
+#include "postgresql/server/fmgr.h"
+#if PG_VERSION_NUM >= 160000
+#include "postgresql/server/varatt.h" /* varlena macros moved here in PG 16 */
+#endif
+
+PG_MODULE_MAGIC;
+
+/* Hashes are HASH_SIZE x HASH_SIZE bits, stored as BYTEA_LEN raw bytes.
+ * An enum keeps the derived values true constant expressions in C99. */
+enum { HASH_SIZE = 12, HASH_BITS = HASH_SIZE * HASH_SIZE, BYTEA_LEN = HASH_BITS / 8 };
+
+/**
+ * Count the differing bits of two hashes, giving up as soon as the count
+ * exceeds `limit` (the result is then only known to be greater than `limit`).
+ *
+ * Raises an SQL error unless both values hold exactly BYTEA_LEN data bytes,
+ * so a shorter value is never read out of bounds.
+ */
+static int hamming(bytea *hash1, bytea *hash2, int limit) {
+    /* VARDATA_ANY skips the varlena header instead of comparing it as data.
+     * unsigned char: a signed char would be sign-extended before the XOR and
+     * add up to 24 phantom bits per byte pair. */
+    const unsigned char *h1 = (const unsigned char *) VARDATA_ANY(hash1);
+    const unsigned char *h2 = (const unsigned char *) VARDATA_ANY(hash2);
+    int distance = 0;
+
+    if (VARSIZE_ANY_EXHDR(hash1) != BYTEA_LEN || VARSIZE_ANY_EXHDR(hash2) != BYTEA_LEN) {
+        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                        errmsg("hashes must be exactly %d bytes long", BYTEA_LEN)));
+    }
+
+    /* Visit indexes 0 .. BYTEA_LEN - 1 only; index BYTEA_LEN is past the end. */
+    for (int i = 0; i < BYTEA_LEN && distance <= limit; i++) {
+        distance += __builtin_popcount((unsigned) (h1[i] ^ h2[i]));
+    }
+
+    return distance;
+}
+
+PG_FUNCTION_INFO_V1(hash_is_within_distance);
+
+/**
+ * Check whether the hamming distance of two hashes is at most the given
+ * maximum. Both hashes must be exactly BYTEA_LEN bytes long.
+ *
+ * Import with
+ *   CREATE OR REPLACE FUNCTION hash_is_within_distance(bytea, bytea, integer) RETURNS boolean
+ *       AS '/path/to/libhamming.so', 'hash_is_within_distance'
+ *       LANGUAGE C STRICT;
+ *
+ * @return true if the distance is <= the third argument, false otherwise
+ */
+Datum hash_is_within_distance(PG_FUNCTION_ARGS) {
+    bytea *hash1 = PG_GETARG_BYTEA_PP(0);
+    bytea *hash2 = PG_GETARG_BYTEA_PP(1);
+    int32 max_distance = PG_GETARG_INT32(2);
+
+    PG_RETURN_BOOL(hamming(hash1, hash2, max_distance) <= max_distance);
+}
+
+PG_FUNCTION_INFO_V1(hash_distance);
+
+/**
+ * Hamming distance of two hashes. Both must be exactly BYTEA_LEN bytes long.
+ *
+ * Import with
+ *   CREATE OR REPLACE FUNCTION hash_distance(bytea, bytea) RETURNS integer
+ *       AS '/path/to/libhamming.so', 'hash_distance'
+ *       LANGUAGE C STRICT;
+ *
+ * @return the number of differing bits, between 0 and HASH_BITS
+ */
+Datum hash_distance(PG_FUNCTION_ARGS) {
+    /* HASH_BITS is the largest possible distance, so the scan never stops early. */
+    int distance = hamming(PG_GETARG_BYTEA_PP(0), PG_GETARG_BYTEA_PP(1), HASH_BITS);
+    PG_RETURN_INT32(distance);
+}