imhashdb/db.go
2020-04-11 11:40:38 -04:00

302 lines
7.4 KiB
Go

package imhashdb
import (
"context"
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
"errors"
"github.com/jackc/pgx"
"github.com/jackc/pgx/pgtype"
"github.com/mailru/easyjson"
"github.com/simon987/fastimagehash-go"
"go.uber.org/zap"
)
// Upper bounds enforced on the search parameters of FindImagesByHash.
const (
	// MaxDistance is the largest hash distance a search may request.
	MaxDistance = 30
	// MaxLimit is the largest page size a search may request.
	MaxLimit = 1000
)
// Entry is the unit of work handed to Store: one image together with its
// hashes, checksums and the metadata records that reference it.
type Entry struct {
	// One hash per algorithm; Store writes each of them to its own
	// hash_* table when the image is new.
	AHash, DHash, MHash, PHash, WHash *fastimagehash.Hash

	// Size is written to the image.size column (presumably the size of the
	// image in bytes; confirm against the producer of Entry values).
	Size int

	// Checksums written to the image row. Sha1 is also the key Store uses
	// to find an image that already exists.
	Sha1   [sha1.Size]byte
	Md5    [md5.Size]byte
	Sha256 [sha256.Size]byte
	Crc32  uint32

	// Meta lists the metadata records to link to the image, and Url is
	// stored alongside each of those links.
	Meta []Meta
	Url  string
}
// Store persists entry to the database.
//
// It first tries to insert a new image row. If that fails, the image is
// assumed to exist already and its id is looked up by SHA-1 instead; in that
// case the hashes are not written again. The metadata records and their links
// to the image are inserted in both cases.
//
// Errors are logged rather than returned. A failed hash insert does not stop
// the remaining work, while a failed id lookup or metadata insert aborts it.
func Store(entry *Entry) {
	var id int64
	isNew := true

	inserted := Pgdb.QueryRow(
		`INSERT INTO image (size, sha1, md5, sha256, crc32) VALUES ($1, $2, $3, $4, $5) RETURNING id;`,
		entry.Size, entry.Sha1[:], entry.Md5[:], entry.Sha256[:], entry.Crc32,
	)
	if insertErr := inserted.Scan(&id); insertErr != nil {
		// Any insert failure is treated as "already stored": fall back to
		// fetching the id of the existing row.
		isNew = false
		existing := Pgdb.QueryRow(`SELECT id FROM image WHERE sha1=$1`, entry.Sha1[:])
		if lookupErr := existing.Scan(&id); lookupErr != nil {
			Logger.Error("FIXME: Could not insert image", zap.Error(lookupErr))
			return
		}
	}

	if isNew {
		hashInserts := []struct {
			query, failure string
			hash           *fastimagehash.Hash
		}{
			{"INSERT INTO hash_ahash VALUES ($1, $2) ON CONFLICT DO NOTHING", "Could not insert ahash", entry.AHash},
			{"INSERT INTO hash_dhash VALUES ($1, $2) ON CONFLICT DO NOTHING", "Could not insert dhash", entry.DHash},
			{"INSERT INTO hash_mhash VALUES ($1, $2) ON CONFLICT DO NOTHING", "Could not insert mhash", entry.MHash},
			{"INSERT INTO hash_phash VALUES ($1, $2) ON CONFLICT DO NOTHING", "Could not insert phash", entry.PHash},
			{"INSERT INTO hash_whash VALUES ($1, $2) ON CONFLICT DO NOTHING", "Could not insert whash", entry.WHash},
		}
		for _, ins := range hashInserts {
			// Best effort: a failure on one hash table is logged and the
			// remaining tables are still attempted.
			if _, err := Pgdb.Exec(ins.query, id, ins.hash.Bytes); err != nil {
				Logger.Error(ins.failure, zap.Error(err))
			}
		}
	}

	for _, m := range entry.Meta {
		if _, err := Pgdb.Exec(
			"INSERT INTO image_meta VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
			m.Id, m.RetrievedAt, m.Meta,
		); err != nil {
			Logger.Error("Could not insert meta", zap.Error(err))
			return
		}

		if _, err := Pgdb.Exec(
			"INSERT INTO image_has_meta VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
			id, entry.Url, m.Id,
		); err != nil {
			Logger.Error("Could not insert ihm", zap.Error(err))
			return
		}
	}
}
// isHashValid reports whether hash has the byte length required for hashType:
// 18 bytes for AHash12, DHash12, MHash12 and PHash12, and 8 bytes for
// WHash8Haar. Any other hash type is rejected.
func isHashValid(hash []byte, hashType HashType) bool {
	switch hashType {
	case AHash12, DHash12, MHash12, PHash12:
		return len(hash) == 18
	case WHash8Haar:
		return len(hash) == 8
	}
	return false
}
// FindImagesByHash looks up the stored images whose hash of type hashType lies
// within distance of hash, and returns them, together with their metadata, as
// an easyjson-encoded ImageList.
//
// Matches are ordered by image id and paginated with limit and offset. The
// request is rejected when hash has the wrong length for hashType, when
// distance exceeds MaxDistance, or when limit exceeds MaxLimit.
//
// Both queries run inside a single READ UNCOMMITTED transaction. Any database,
// row-iteration or JSON-encoding error is returned with a nil result.
func FindImagesByHash(ctx context.Context, hash []byte, hashType HashType, distance, limit, offset uint) ([]byte, error) {
	if !isHashValid(hash, hashType) {
		return nil, errors.New("invalid hash")
	}
	if distance > MaxDistance {
		return nil, errors.New("Invalid distance")
	}
	if limit > MaxLimit {
		return nil, errors.New("invalid limit")
	}

	// Every hash type has its own table; the 18-byte and 8-byte hashes use
	// different SQL distance functions.
	var table, within string
	switch hashType {
	case AHash12:
		table, within = "hash_ahash", "hash_is_within_distance18"
	case DHash12:
		table, within = "hash_dhash", "hash_is_within_distance18"
	case MHash12:
		table, within = "hash_mhash", "hash_is_within_distance18"
	case PHash12:
		table, within = "hash_phash", "hash_is_within_distance18"
	case WHash8Haar:
		table, within = "hash_whash", "hash_is_within_distance8"
	default:
		// Not reachable while isHashValid rejects unknown types; kept so an
		// incomplete query can never be sent.
		return nil, errors.New("invalid hash")
	}
	// table and within come only from the fixed list above, never from the
	// caller, so concatenating them into the statement is safe.
	sql := `SELECT image.* FROM image INNER JOIN ` + table + ` h on image.id = h.image_id ` +
		`WHERE ` + within + `(h.hash, $1, $2) ORDER BY image.id LIMIT $3 OFFSET $4`

	tx, err := Pgdb.BeginEx(ctx, &pgx.TxOptions{IsoLevel: pgx.ReadUncommitted})
	if err != nil {
		return nil, err
	}
	// The transaction only reads, so committing merely ends it, on every
	// return path.
	defer tx.Commit()

	rows, err := tx.Query(sql, hash, distance, limit, offset)
	if err != nil {
		return nil, err
	}

	var images []*Image
	for rows.Next() {
		var im Image
		if err := rows.Scan(&im.id, &im.Size, &im.Sha1, &im.Md5, &im.Sha256, &im.Crc32); err != nil {
			Logger.Error("Error while fetching db image", zap.String("err", err.Error()))
			return nil, err
		}
		images = append(images, &im)
	}
	// Next also returns false when iteration fails part-way; without this
	// check a truncated result set would be reported as success.
	if err := rows.Err(); err != nil {
		Logger.Error("Error while fetching db image", zap.String("err", err.Error()))
		return nil, err
	}

	// Fetch the metadata of all matches in one round trip: one queued query
	// per image, whose results are read back in the same order.
	batch := tx.BeginBatch()
	defer batch.Close()

	for _, im := range images {
		batch.Queue(
			`SELECT ihm.url, meta.id, meta.retrieved_at, meta.meta FROM image_has_meta ihm
INNER JOIN image_meta meta on ihm.image_meta_id = meta.id
WHERE image_id=$1`,
			[]interface{}{im.id},
			// image.id is BIGSERIAL (see DbInit), so the parameter is
			// declared as int8; an int4 parameter cannot carry ids above
			// 2^31-1.
			[]pgtype.OID{pgtype.Int8OID},
			nil,
		)
	}

	if err := batch.Send(ctx, nil); err != nil {
		Logger.Error("Error while fetching db meta", zap.String("err", err.Error()))
		return nil, err
	}

	for _, im := range images {
		metaRows, err := batch.QueryResults()
		if err != nil {
			Logger.Error("Error while fetching db meta", zap.String("err", err.Error()))
			return nil, err
		}

		for metaRows.Next() {
			var ihm ImageHasMeta
			if err := metaRows.Scan(&ihm.Url, &ihm.Meta.Id, &ihm.Meta.RetrievedAt, &ihm.Meta.Meta); err != nil {
				return nil, err
			}
			im.Meta = append(im.Meta, ihm)
		}
		if err := metaRows.Err(); err != nil {
			Logger.Error("Error while fetching db meta", zap.String("err", err.Error()))
			return nil, err
		}
	}

	b, err := easyjson.Marshal(ImageList{Images: images})
	if err != nil {
		return nil, err
	}
	return b, nil
}
// DbInit creates the schema used by this package on the database behind pool:
// the image table with its checksum indexes, the image_meta and image_has_meta
// tables, and one hash_* table per hash type. Every statement uses
// IF NOT EXISTS, so existing tables and indexes are left as they are.
//
// A failure is reported through Logger.Fatal.
func DbInit(pool *pgx.ConnPool) {
	const schema = `
CREATE TABLE IF NOT EXISTS image (
id BIGSERIAL PRIMARY KEY,
size INT,
sha1 bytea,
md5 bytea,
sha256 bytea,
crc32 bigint
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_image_sha1 ON image(sha1);
CREATE INDEX IF NOT EXISTS idx_image_md5 ON image(md5);
CREATE INDEX IF NOT EXISTS idx_image_sha256 ON image(sha256);
CREATE INDEX IF NOT EXISTS idx_image_crc32 ON image(crc32);
CREATE TABLE IF NOT EXISTS image_meta (
id TEXT UNIQUE,
retrieved_at bigint,
meta bytea
);
CREATE TABLE IF NOT EXISTS image_has_meta (
image_id bigint REFERENCES image(id),
url TEXT,
image_meta_id text REFERENCES image_meta(id),
UNIQUE(image_id, image_meta_id)
);
CREATE TABLE IF NOT EXISTS hash_phash (
image_id BIGINT REFERENCES image(id) UNIQUE,
hash bytea
);
CREATE TABLE IF NOT EXISTS hash_ahash (
image_id BIGINT REFERENCES image(id) UNIQUE,
hash bytea
);
CREATE TABLE IF NOT EXISTS hash_dhash (
image_id BIGINT REFERENCES image(id) UNIQUE,
hash bytea
);
CREATE TABLE IF NOT EXISTS hash_mhash (
image_id BIGINT REFERENCES image(id) UNIQUE,
hash bytea
);
CREATE TABLE IF NOT EXISTS hash_whash (
image_id BIGINT REFERENCES image(id) UNIQUE,
hash bytea
);
`

	// The whole schema is sent as a single multi-statement Exec.
	if _, err := pool.Exec(schema); err != nil {
		Logger.Fatal("Could not initialize database", zap.String("err", err.Error()))
	}
}
// DbConnect opens a pgx connection pool of at most 10 connections to the given
// PostgreSQL server and database, and returns it.
//
// port must fit in 16 bits (0-65535). The function has no error return, so
// an out-of-range port, like a failure to create the pool, causes a panic.
func DbConnect(host string, port int, user, password, database string) *pgx.ConnPool {
	// uint16(port) would silently wrap an out-of-range value (e.g. 70000
	// becomes 4464) and connect to the wrong port, so reject it up front.
	if port < 0 || port > 65535 {
		panic(errors.New("imhashdb: port out of range (0-65535)"))
	}

	pool, err := pgx.NewConnPool(pgx.ConnPoolConfig{
		ConnConfig: pgx.ConnConfig{
			Host:     host,
			User:     user,
			Port:     uint16(port),
			Password: password,
			Database: database,
		},
		MaxConnections: 10,
	})
	if err != nil {
		panic(err)
	}
	return pool
}