matchlist tmp table + zstd meta compression

This commit is contained in:
simon987 2020-04-28 10:32:09 -04:00
parent 5cd367456c
commit 8d7bc2d133
8 changed files with 111 additions and 13 deletions

19
core.go
View File

@ -7,6 +7,7 @@ import (
"github.com/pkg/errors"
"github.com/simon987/fastimagehash-go"
"github.com/valyala/fasthttp"
"github.com/valyala/gozstd"
"go.uber.org/zap"
"strings"
)
@ -44,6 +45,8 @@ var ImageBlackList = []string{}
var Rdb *redis.Client
var Pgdb *pgx.ConnPool
var CDict *gozstd.CDict
var DDict *gozstd.DDict
var Logger *zap.Logger
var Conf Config
@ -68,6 +71,16 @@ func Init() {
Conf.PgDb,
)
DbInit(Pgdb)
CDict, err = gozstd.NewCDictLevel(DictBytes, 19)
if err != nil {
Logger.Fatal("Could not initialize zstd cdict")
}
DDict, err = gozstd.NewDDict(DictBytes)
if err != nil {
Logger.Fatal("Could not initialize zstd ddict")
}
}
func ComputeHash(data []byte) (*Hashes, error) {
@ -113,15 +126,15 @@ func ComputeHash(data []byte) (*Hashes, error) {
return nil, errors.Errorf("pHash error: %d", int(code))
}
h.WHash8, code = fastimagehash.WHashMem(data, 8, 0, fastimagehash.Haar)
h.WHash8, code = fastimagehash.WHashMem(data, 8, 0, true, fastimagehash.Haar)
if code != fastimagehash.Ok {
return nil, errors.Errorf("wHash error: %d", int(code))
}
h.WHash16, code = fastimagehash.WHashMem(data, 16, 0, fastimagehash.Haar)
h.WHash16, code = fastimagehash.WHashMem(data, 16, 0, true, fastimagehash.Haar)
if code != fastimagehash.Ok {
return nil, errors.Errorf("wHash error: %d", int(code))
}
h.WHash32, code = fastimagehash.WHashMem(data, 32, 0, fastimagehash.Haar)
h.WHash32, code = fastimagehash.WHashMem(data, 32, 0, true, fastimagehash.Haar)
if code != fastimagehash.Ok {
return nil, errors.Errorf("wHash error: %d", int(code))
}

73
db.go
View File

@ -10,6 +10,7 @@ import (
"github.com/jackc/pgx"
"github.com/jackc/pgx/pgtype"
"github.com/mailru/easyjson"
"github.com/valyala/gozstd"
"go.uber.org/zap"
)
@ -27,6 +28,30 @@ type Entry struct {
Url string
}
// MatchTrigger describes one generated database trigger. The trigger fires
// after each insert into the hash_<HashType> table and inserts rows into the
// matchlist table for existing hashes that satisfy hash_is_within_distance<N>,
// where <N> is HashType.HashLength().
type MatchTrigger struct {
// HashType selects the hash_<type> table the trigger is attached to and,
// through HashLength(), the hash_distance<N> / hash_is_within_distance<N>
// SQL functions it calls.
HashType HashType
// MinDistance is the threshold passed as the last argument of
// hash_is_within_distance<N> in the generated trigger.
// NOTE(review): despite the name this looks like an upper bound on the
// distance — confirm against the SQL function's definition.
MinDistance int
// Id is the constant written to the matchlist.id column for every row
// produced by this trigger.
Id int
}
// MatchTriggers lists the match triggers installed on the hash tables. One
// on_<hashtype>_insert trigger function is generated per entry; each entry
// pairs a hash type with its distance threshold and the id stored in the
// matchlist table.
var MatchTriggers = []MatchTrigger{
	{HashType: DHash16, MinDistance: 25, Id: 1},
	{HashType: PHash16, MinDistance: 25, Id: 2},
	{HashType: WHash16Haar, MinDistance: 6, Id: 3},
}
func Store(entry *Entry) {
row := Pgdb.QueryRow(
`INSERT INTO image (size, sha1, md5, sha256, crc32) VALUES ($1, $2, $3, $4, $5) RETURNING id;`,
@ -48,7 +73,10 @@ func Store(entry *Entry) {
}
if !imageExists {
_, _ = Pgdb.Exec("INSERT INTO hash_dhash8 VALUES ($1, $2) ON CONFLICT DO NOTHING", id, entry.H.DHash8.Bytes)
_, err = Pgdb.Exec("INSERT INTO hash_dhash8 VALUES ($1, $2) ON CONFLICT DO NOTHING", id, entry.H.DHash8.Bytes)
if err != nil {
panic(err)
}
_, _ = Pgdb.Exec("INSERT INTO hash_dhash16 VALUES ($1, $2) ON CONFLICT DO NOTHING", id, entry.H.DHash16.Bytes)
_, _ = Pgdb.Exec("INSERT INTO hash_dhash32 VALUES ($1, $2) ON CONFLICT DO NOTHING", id, entry.H.DHash32.Bytes)
@ -65,10 +93,13 @@ func Store(entry *Entry) {
_, _ = Pgdb.Exec("INSERT INTO hash_whash32haar VALUES ($1, $2) ON CONFLICT DO NOTHING", id, entry.H.WHash32.Bytes)
}
var buf []byte
for _, meta := range entry.Meta {
compressedMeta := gozstd.CompressDict(buf[:0], meta.Meta, CDict)
_, err = Pgdb.Exec(
"INSERT INTO image_meta VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
meta.Id, meta.RetrievedAt, meta.Meta,
"INSERT INTO image_meta (id, retrieved_at, meta) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
meta.Id, meta.RetrievedAt, compressedMeta,
)
if err != nil {
Logger.Error("Could not insert meta", zap.Error(err))
@ -164,7 +195,8 @@ func FindImagesByHash(ctx context.Context, hash []byte, hashType HashType, dista
}
if images == nil {
return nil, nil
b, _ := easyjson.Marshal(ImageList{Images: []*Image{}})
return b, nil
}
batch := tx.BeginBatch()
@ -195,7 +227,11 @@ func FindImagesByHash(ctx context.Context, hash []byte, hashType HashType, dista
for rows.Next() {
var ihm ImageHasMeta
err := rows.Scan(&ihm.Url, &ihm.Meta.Id, &ihm.Meta.RetrievedAt, &ihm.Meta.Meta)
var compressedMeta []byte
err := rows.Scan(&ihm.Url, &ihm.Meta.Id, &ihm.Meta.RetrievedAt, &compressedMeta)
ihm.Meta.Meta, err = gozstd.DecompressDict(nil, compressedMeta, DDict)
if err != nil {
return nil, err
}
@ -221,8 +257,8 @@ CREATE TABLE IF NOT EXISTS image (
CREATE UNIQUE INDEX IF NOT EXISTS idx_image_sha1 ON image(sha1);
CREATE TABLE IF NOT EXISTS image_meta (
id TEXT UNIQUE NOT NULL,
retrieved_at bigint NOT NULL,
id TEXT PRIMARY KEY,
meta bytea NOT NULL
);
@ -232,6 +268,13 @@ CREATE TABLE IF NOT EXISTS image_has_meta (
image_meta_id text REFERENCES image_meta(id) NOT NULL,
UNIQUE(image_id, image_meta_id)
);
CREATE TABLE IF NOT EXISTS matchlist (
id smallint,
distance smallint NOT NULL,
im1 bigint NOT NULL,
im2 bigint NOT NULL
);
`
for _, hashType := range HashTypes {
sql += fmt.Sprintf(`CREATE TABLE IF NOT EXISTS hash_%s (
@ -239,6 +282,24 @@ CREATE TABLE IF NOT EXISTS image_has_meta (
hash bytea NOT NULL);`, hashType)
}
for _, trigger := range MatchTriggers {
sql += fmt.Sprintf(`
CREATE OR REPLACE FUNCTION on_%s_insert() RETURNS TRIGGER AS $$
BEGIN
INSERT INTO matchlist (id, distance, im1, im2)
SELECT %d, hash_distance%d(hash, NEW.hash), NEW.image_id, image_id FROM hash_%s AS h
WHERE h.image_id != NEW.image_id AND hash_is_within_distance%d(hash, NEW.hash, %d);
RETURN NEW;
END;
$$ LANGUAGE 'plpgsql';
DROP TRIGGER IF EXISTS on_%s_insert ON hash_%s;
CREATE TRIGGER on_%s_insert AFTER INSERT ON hash_%s
FOR EACH ROW EXECUTE PROCEDURE on_%s_insert();`,
trigger.HashType, trigger.Id, trigger.HashType.HashLength(), trigger.HashType,
trigger.HashType.HashLength(), trigger.MinDistance, trigger.HashType,
trigger.HashType, trigger.HashType, trigger.HashType, trigger.HashType)
}
_, err := pool.Exec(sql)
if err != nil {
Logger.Fatal("Could not initialize database", zap.String("err", err.Error()))

BIN
dictionary Normal file

Binary file not shown.

3
embed.go Normal file

File diff suppressed because one or more lines are too long

5
go.mod
View File

@ -22,16 +22,17 @@ require (
github.com/onsi/gomega v1.9.0 // indirect
github.com/pkg/errors v0.9.1
github.com/shopspring/decimal v0.0.0-20200227202807-02e2044944cc // indirect
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e
github.com/simon987/fastimagehash-go v0.0.0-20200426185525-dad3269c77a9
github.com/stretchr/testify v1.5.1 // indirect
github.com/urfave/cli/v2 v2.2.0
github.com/valyala/fasthttp v1.9.0
github.com/valyala/gozstd v1.7.0
go.uber.org/zap v1.14.1
golang.org/x/crypto v0.0.0-20200406173513-056763e48d71 // indirect
golang.org/x/lint v0.0.0-20200302205851-738671d3881b // indirect
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e // indirect
golang.org/x/sys v0.0.0-20200409092240-59c9f1ba88fa // indirect
golang.org/x/tools v0.0.0-20200410194907-79a7a3126eef // indirect
golang.org/x/tools v0.0.0-20200425043458-8463f397d07c // indirect
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect
honnef.co/go/tools v0.0.1-2020.1.3 // indirect
)

15
go.sum
View File

@ -32,6 +32,9 @@ github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.3.5 h1:F768QJ1E9tib+q5Sc8MkdJi1RxLTbRcTf8LJV56aRls=
github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk=
github.com/google/brotli v1.0.7 h1:fxwwohNEPaVS6qvtnjwgzRR62Upa70pkw0f9qarjrQs=
github.com/google/brotli/go/cbrotli v0.0.0-20200331123801-f83aa5169e3c h1:ar2QDaYlLo8sO8Lto330cewxK1KSDGhXhAbDPdDGqBs=
github.com/google/brotli/go/cbrotli v0.0.0-20200331123801-f83aa5169e3c/go.mod h1:nOPhAkwVliJdNTkj3gXpljmWhjc4wCaVqbMJcPKWP4s=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
@ -60,6 +63,8 @@ github.com/mailru/easyjson v0.7.1 h1:mdxE1MF9o53iCb2Ghj1VfWvh7ZOwHpnVG/xwXrV90U8
github.com/mailru/easyjson v0.7.1/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY=
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mjibson/esc v0.2.0 h1:k96hdaR9Z+nMcnDwNrOvhdBqtjyMrbVyxLpsRCdP2mA=
github.com/mjibson/esc v0.2.0/go.mod h1:9Hw9gxxfHulMF5OJKCyhYD7PzlSdhzXyaGEBRPH1OPs=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@ -81,6 +86,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rakyll/statik v0.1.7 h1:OF3QCZUuyPxuGEP7B4ypUa7sB/iHtqOTDYZXGM8KOdQ=
github.com/rakyll/statik v0.1.7/go.mod h1:AlZONWzMtEnMs7W4e/1LURLiI49pIMmp6V9Unghqrcc=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
@ -98,6 +105,8 @@ github.com/simon987/fastimagehash-go v0.0.0-20200412154506-b0e9d9b3a73e h1:8+cH+
github.com/simon987/fastimagehash-go v0.0.0-20200412154506-b0e9d9b3a73e/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e h1:86MhzPgOTM6dmzNF4qAOGY4zaZ9BgQFHLwwilMZla8I=
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/simon987/fastimagehash-go v0.0.0-20200426185525-dad3269c77a9 h1:9Ttq0K7AU1F20LK3n/tiPWq0yPVWOlQRjXOmU5Eh7fs=
github.com/simon987/fastimagehash-go v0.0.0-20200426185525-dad3269c77a9/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
@ -107,6 +116,8 @@ github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo=
github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw=
github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs=
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
github.com/ulikunitz/xz v0.5.7 h1:YvTNdFzX6+W5m9msiYg/zpkSURPPtOlzbqYjrFn7Yt4=
github.com/ulikunitz/xz v0.5.7/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
github.com/urfave/cli v1.22.4 h1:u7tSpNPPswAFymm8IehJhy4uJMlUuU/GmqSkvJ1InXA=
github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4=
github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
@ -114,6 +125,8 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.9.0 h1:hNpmUdy/+ZXYpGy0OBfm7K0UQTzb73W0T0U4iJIVrMw=
github.com/valyala/fasthttp v1.9.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w=
github.com/valyala/gozstd v1.7.0 h1:Ljh5c9zboqLhwTI33al32R72iCZfn0mCbVGcFWbGwRQ=
github.com/valyala/gozstd v1.7.0/go.mod h1:y5Ew47GLlP37EkTB+B4s7r6A5rdaeB7ftbl9zoYiIPQ=
github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/atomic v1.6.0 h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk=
@ -171,6 +184,8 @@ golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtn
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200410194907-79a7a3126eef h1:RHORRhs540cYZYrzgU2CPUyykkwZM78hGdzocOo9P8A=
golang.org/x/tools v0.0.0-20200410194907-79a7a3126eef/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20200425043458-8463f397d07c h1:iHhCR0b26amDCiiO+kBguKZom9aMF+NrFxh9zeKR/XU=
golang.org/x/tools v0.0.0-20200425043458-8463f397d07c/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=

View File

@ -28,6 +28,11 @@ func dispatchFromQueue(pattern string, queue chan []string) error {
continue
}
if len(keys) == 0 {
time.Sleep(time.Second * 1)
continue
}
rawTask, err := Rdb.BLPop(time.Second*30, keys...).Result()
if err != nil {
continue
@ -106,7 +111,7 @@ func trimUrl(link string) string {
}
func Main() error {
queue := make(chan []string, Conf.HasherConcurrency*2)
queue := make(chan []string)
for i := 0; i < Conf.HasherConcurrency; i++ {
go worker(queue)

View File

@ -12,7 +12,7 @@ const inQueue = "qq:in"
const outQueue = "qq:out:"
const wipQueue = "qq:wip"
const CacheLength = time.Second * 30
const CacheLength = time.Second * 5
func queryWorker() {
Logger.Info("Query worker started")