This commit is contained in:
simon987 2020-04-15 09:42:50 -04:00
parent 91f0c9b1f9
commit 5cd367456c
12 changed files with 273 additions and 166 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
*.prof
.idea/
*.iml
*.iml
cli/cli

View File

@ -1,17 +0,0 @@
FROM ubuntu:16.04
# WIP...
RUN apt update
RUN apt install git libopencv-dev wget libssl-dev -y
RUN wget https://github.com/Kitware/CMake/releases/download/v3.16.2/cmake-3.16.2.tar.gz && \
tar -xzf cmake-*.tar.gz && cd cmake-* && ./bootstrap && make -j 4 && make install
WORKDIR /build/
RUN git clone --recursive https://github.com/simon987/fastimagehash
WORKDIR /build/fastimagehash
RUN cmake .
RUN make && make install

194
cli/main.go Normal file
View File

@ -0,0 +1,194 @@
package main
import (
. "github.com/simon987/imhashdb"
"github.com/simon987/imhashdb/hasher"
api "github.com/simon987/imhashdb/web"
"github.com/urfave/cli/v2"
"os"
)
func main() {
app := &cli.App{
Commands: []*cli.Command{
{
Name: "web",
Usage: "Start http API",
Action: func(c *cli.Context) error {
Init()
return api.Main()
},
Flags: []cli.Flag{
&cli.StringFlag{
Name: "pg-user",
Value: "imhashdb",
Usage: "PostgreSQL user",
EnvVars: []string{"IMHASHDB_PG_USER"},
Destination: &Conf.PgUser,
},
&cli.StringFlag{
Name: "pg-password",
Value: "imhashdb",
Usage: "PostgreSQL password",
EnvVars: []string{"IMHASHDB_PG_PASSWORD"},
Destination: &Conf.PgPassword,
},
&cli.StringFlag{
Name: "pg-db",
Value: "imhashdb",
Usage: "PostgreSQL database",
EnvVars: []string{"IMHASHDB_PG_DATABASE"},
Destination: &Conf.PgDb,
},
&cli.StringFlag{
Name: "pg-host",
Value: "localhost",
Usage: "PostgreSQL host",
EnvVars: []string{"IMHASHDB_PG_HOST"},
Destination: &Conf.PgHost,
},
&cli.IntFlag{
Name: "pg-port",
Value: 5432,
Usage: "PostgreSQL port",
EnvVars: []string{"IMHASHDB_PG_PORT"},
Destination: &Conf.PgPort,
},
&cli.StringFlag{
Name: "redis-addr",
Value: "localhost:6379",
Usage: "redis address",
EnvVars: []string{"IMHASHDB_REDIS_ADDR"},
Destination: &Conf.RedisAddr,
},
&cli.StringFlag{
Name: "redis-password",
Value: "",
Usage: "redis password",
EnvVars: []string{"IMHASHDB_REDIS_PASSWORD"},
Destination: &Conf.RedisPassword,
},
&cli.IntFlag{
Name: "redis-db",
Value: 0,
Usage: "redis db",
EnvVars: []string{"IMHASHDB_REDIS_DB"},
Destination: &Conf.RedisDb,
},
&cli.StringFlag{
Name: "api-addr",
Value: "localhost:8080",
Usage: "HTTP api address",
EnvVars: []string{"IMHASHDB_API_ADDR"},
Destination: &Conf.ApiAddr,
},
&cli.IntFlag{
Name: "query-concurrency",
Value: 2,
Usage: "Number of background query workers",
EnvVars: []string{"IMHASHDB_QUERY_CONCURRENCY"},
Destination: &Conf.QueryConcurrency,
},
},
},
{
Name: "hasher",
Usage: "Start an hasher instance",
Action: func(c *cli.Context) error {
Init()
return hasher.Main()
},
Flags: []cli.Flag{
&cli.StringFlag{
Name: "pg-user",
Value: "imhashdb",
Usage: "PostgreSQL user",
EnvVars: []string{"IMHASHDB_PG_USER"},
Destination: &Conf.PgUser,
},
&cli.StringFlag{
Name: "pg-password",
Value: "imhashdb",
Usage: "PostgreSQL password",
EnvVars: []string{"IMHASHDB_PG_PASSWORD"},
Destination: &Conf.PgPassword,
},
&cli.StringFlag{
Name: "pg-db",
Value: "imhashdb",
Usage: "PostgreSQL database",
EnvVars: []string{"IMHASHDB_PG_DATABASE"},
Destination: &Conf.PgDb,
},
&cli.StringFlag{
Name: "pg-host",
Value: "localhost",
Usage: "PostgreSQL host",
EnvVars: []string{"IMHASHDB_PG_HOST"},
Destination: &Conf.PgHost,
},
&cli.IntFlag{
Name: "pg-port",
Value: 5432,
Usage: "PostgreSQL port",
EnvVars: []string{"IMHASHDB_PG_PORT"},
Destination: &Conf.PgPort,
},
&cli.StringFlag{
Name: "redis-addr",
Value: "localhost:6379",
Usage: "redis address",
EnvVars: []string{"IMHASHDB_REDIS_ADDR"},
Destination: &Conf.RedisAddr,
},
&cli.StringFlag{
Name: "redis-password",
Value: "",
Usage: "redis password",
EnvVars: []string{"IMHASHDB_REDIS_PASSWORD"},
Destination: &Conf.RedisPassword,
},
&cli.IntFlag{
Name: "redis-db",
Value: 0,
Usage: "redis db",
EnvVars: []string{"IMHASHDB_REDIS_DB"},
Destination: &Conf.RedisDb,
},
&cli.StringFlag{
Name: "imgur-clientid",
Value: "546c25a59c58ad7",
Usage: "imgur API client id",
EnvVars: []string{"IMHASHDB_IMGUR_CLIENTID"},
Destination: &Conf.ImgurClientId,
},
&cli.StringFlag{
Name: "hasher-pattern",
Value: "q.*",
Usage: "redis pattern for hasher input tasks",
EnvVars: []string{"IMHASHDB_HASHER_PATTERN"},
Destination: &Conf.HasherPattern,
},
&cli.IntFlag{
Name: "hash-concurrency",
Value: 4,
Usage: "Thread count for hasher",
EnvVars: []string{"IMHASHDB_HASH_CONCURRENCY"},
Destination: &Conf.HasherConcurrency,
},
},
},
},
Authors: []*cli.Author{
{
Name: "simon987",
Email: "me@simon987.net",
},
},
}
err := app.Run(os.Args)
if err != nil {
panic(err)
}
}

107
core.go
View File

@ -8,17 +8,11 @@ import (
"github.com/simon987/fastimagehash-go"
"github.com/valyala/fasthttp"
"go.uber.org/zap"
"log"
"net"
"net/url"
"os"
"strings"
"syscall"
)
const RedisPrefix = "q."
const UserAgent = "imhashdb/v1.0"
const Concurrency = 4
var ImageSuffixes = []string{
".jpeg", ".jpg", ".png",
@ -26,22 +20,53 @@ var ImageSuffixes = []string{
".bmp", ".webp",
}
type Config struct {
PgUser string
PgPassword string
PgDb string
PgHost string
PgPort int
RedisAddr string
RedisPassword string
RedisDb int
ApiAddr string
HasherConcurrency int
QueryConcurrency int
ImgurClientId string
HasherPattern string
}
var ImageBlackList = []string{}
var Rdb *redis.Client
var Pgdb *pgx.ConnPool
var Logger *zap.Logger
var Conf Config
func Init() {
Logger, _ = zap.NewDevelopment()
Rdb = redis.NewClient(&redis.Options{
Addr: "localhost:6379",
Password: "",
DB: 0,
Addr: Conf.RedisAddr,
Password: Conf.RedisPassword,
DB: Conf.RedisDb,
})
_, err := Rdb.Ping().Result()
if err != nil {
Logger.Fatal("Could not connect to redis server")
}
Pgdb = DbConnect("localhost", 5432, "imhashdb", "imhashdb", "imhashdb")
Pgdb = DbConnect(
Conf.PgHost,
Conf.PgPort,
Conf.PgUser,
Conf.PgPassword,
Conf.PgDb,
)
DbInit(Pgdb)
}
@ -184,65 +209,3 @@ func Fetch(link string, headers ...[]string) ([]byte, error) {
return body, nil
}
func IsPermanentError(err error) bool {
if strings.HasPrefix(err.Error(), "HTTP") {
//TODO: Handle http 429 etc?
return true
}
var opErr *net.OpError
urlErr, ok := err.(*url.Error)
if ok {
opErr, ok = urlErr.Err.(*net.OpError)
if !ok {
if urlErr.Err != nil && urlErr.Err.Error() == "Proxy Authentication Required" {
return true
}
return false
}
if opErr.Err.Error() == "Internal Privoxy Error" {
return true
}
} else {
_, ok := err.(net.Error)
if ok {
_, ok := err.(*net.DNSError)
return ok
}
}
if opErr == nil {
return false
}
if opErr.Timeout() {
// Usually means thalt there is no route to host
return true
}
switch t := opErr.Err.(type) {
case *net.DNSError:
return true
case *os.SyscallError:
if errno, ok := t.Err.(syscall.Errno); ok {
switch errno {
case syscall.ECONNREFUSED:
log.Println("connect refused")
return true
case syscall.ETIMEDOUT:
log.Println("timeout")
return false
case syscall.ECONNRESET:
log.Println("connection reset by peer")
return false
}
}
}
return false
}

3
go.mod
View File

@ -22,8 +22,9 @@ require (
github.com/onsi/gomega v1.9.0 // indirect
github.com/pkg/errors v0.9.1
github.com/shopspring/decimal v0.0.0-20200227202807-02e2044944cc // indirect
github.com/simon987/fastimagehash-go v0.0.0-20200412154506-b0e9d9b3a73e
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e
github.com/stretchr/testify v1.5.1 // indirect
github.com/urfave/cli/v2 v2.2.0
github.com/valyala/fasthttp v1.9.0
go.uber.org/zap v1.14.1
golang.org/x/crypto v0.0.0-20200406173513-056763e48d71 // indirect

11
go.sum
View File

@ -2,6 +2,8 @@ github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I=
github.com/cockroachdb/apd v1.1.0/go.mod h1:8Sl8LxpKi29FqWXR16WEFZRNSz3SoPzUzeMeY4+DwBQ=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@ -80,8 +82,12 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/shopspring/decimal v0.0.0-20200227202807-02e2044944cc h1:jUIKcSPO9MoMJBbEoyE/RJoE8vz7Mb8AjvifMMwSyvY=
github.com/shopspring/decimal v0.0.0-20200227202807-02e2044944cc/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/simon987/fastimagehash-go v0.0.0-20200411005122-1886a7c50720 h1:0VrGo7jKQqv5cmuD/7Yd2O+o98/eyLi2wl4wWNsKfh0=
github.com/simon987/fastimagehash-go v0.0.0-20200411005122-1886a7c50720/go.mod h1:MbqNG+6OaprdElEIes1aYF7qmLlaTop4j5X6pgNiaaw=
github.com/simon987/fastimagehash-go v0.0.0-20200411154912-569fe641b1a7 h1:4XD2rCg4hJRcCZErDLp8lfMsHw5Zinr5e2t2C18GdzU=
@ -90,6 +96,8 @@ github.com/simon987/fastimagehash-go v0.0.0-20200412153922-bcfdf954b464 h1:5p3TX
github.com/simon987/fastimagehash-go v0.0.0-20200412153922-bcfdf954b464/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/simon987/fastimagehash-go v0.0.0-20200412154506-b0e9d9b3a73e h1:8+cH+kriBBb9OqtKh/wNsr+PvV8e73yNjEly5wAjFQk=
github.com/simon987/fastimagehash-go v0.0.0-20200412154506-b0e9d9b3a73e/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e h1:86MhzPgOTM6dmzNF4qAOGY4zaZ9BgQFHLwwilMZla8I=
github.com/simon987/fastimagehash-go v0.0.0-20200412174912-bee8c91bb52e/go.mod h1:fmgaZptm6M5Kn3Ctu/R5p2fncGYPpGi/raZCZUrkRsY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
@ -99,6 +107,9 @@ github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo=
github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw=
github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs=
github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY=
github.com/urfave/cli v1.22.4 h1:u7tSpNPPswAFymm8IehJhy4uJMlUuU/GmqSkvJ1InXA=
github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4=
github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.9.0 h1:hNpmUdy/+ZXYpGy0OBfm7K0UQTzb73W0T0U4iJIVrMw=

View File

@ -1,4 +1,4 @@
package main
package hasher
import (
"crypto/md5"
@ -19,7 +19,7 @@ type Task struct {
Id int64 `json:"_id"`
}
func dispatchFromQueue(pattern string, queue chan []string) {
func dispatchFromQueue(pattern string, queue chan []string) error {
for {
keys, err := Rdb.Keys(pattern).Result()
@ -33,8 +33,11 @@ func dispatchFromQueue(pattern string, queue chan []string) {
continue
}
//TODO: put in WIP list, resume on crash
queue <- rawTask
}
return nil
}
func worker(queue chan []string) {
@ -66,11 +69,6 @@ func computeAndStore(rawTask []string) {
data, err := Fetch(turl)
if err != nil {
if !IsPermanentError(err) {
// Retry later
Logger.Debug("Will retry task later", zap.String("link", link))
Rdb.RPush(rawTask[0], rawTask[1])
}
continue
}
@ -107,19 +105,12 @@ func trimUrl(link string) string {
return link
}
func main() {
Init()
func Main() error {
queue := make(chan []string, Conf.HasherConcurrency*2)
_, err := Rdb.Ping().Result()
if err != nil {
Logger.Fatal("Could not connect to redis server")
}
queue := make(chan []string, 100)
for i := 0; i < Concurrency; i++ {
for i := 0; i < Conf.HasherConcurrency; i++ {
go worker(queue)
}
dispatchFromQueue("q.reddit.*", queue)
return dispatchFromQueue("q.reddit.*", queue)
}

View File

@ -48,6 +48,10 @@ func IsImageLink(link string) bool {
func handleImgurLink(link string, meta *[]Meta) []string {
if strings.HasPrefix(link, "https://imgur.fun/") {
link = strings.Replace(link, "imgur.fun", "imgur.com", 1)
}
if ReImgurImg.MatchString(link) {
id := ReImgurImg.FindStringSubmatch(link)[1]
@ -56,7 +60,7 @@ func handleImgurLink(link string, meta *[]Meta) []string {
err := FetchJson(
"https://api.imgur.com/3/image/"+id,
&img, &rawJson,
[]string{"Authorization", "Client-Id 546c25a59c58ad7"},
[]string{"Authorization", "Client-Id " + Conf.ImgurClientId},
)
if err != nil {
return nil
@ -75,7 +79,7 @@ func handleImgurLink(link string, meta *[]Meta) []string {
err := FetchJson(
"https://api.imgur.com/3/album/"+id,
&album, &rawJson,
[]string{"Authorization", "Client-Id 546c25a59c58ad7"},
[]string{"Authorization", "Client-Id " + Conf.ImgurClientId},
)
if err != nil {
return nil

View File

@ -906,9 +906,9 @@ func easyjsonD2b7633eDecodeGithubComSimon987FastimagehashGo(in *jlexer.Lexer, ou
continue
}
switch key {
case "Size":
case "size":
out.Size = int(in.Int())
case "Bytes":
case "bytes":
if in.IsNull() {
in.Skip()
out.Bytes = nil
@ -930,12 +930,12 @@ func easyjsonD2b7633eEncodeGithubComSimon987FastimagehashGo(out *jwriter.Writer,
first := true
_ = first
{
const prefix string = ",\"Size\":"
const prefix string = ",\"size\":"
out.RawString(prefix[1:])
out.Int(int(in.Size))
}
{
const prefix string = ",\"Bytes\":"
const prefix string = ",\"bytes\":"
out.RawString(prefix)
out.Base64Bytes(in.Bytes)
}

24
test.py
View File

@ -1,24 +0,0 @@
import requests
from base64 import b64encode
import json
with open("/home/simon/Downloads/a.jpg", "rb") as f:
data = f.read()
r = requests.post("http://localhost:8080/api/hash", data=json.dumps({
"data": b64encode(data).decode()
}))
# print(r.content)
for i in range (0, 49):
r2 = requests.post("http://localhost:8080/api/query", data=json.dumps({
"hash": r.json()["ahash:12"],
"type": "ahash:12",
"distance": 30,
"limit": 500 + i,
"offset": 0
}))
print(r2.content.decode())

View File

@ -1,16 +1,12 @@
package main
package api
import (
"context"
"errors"
"fmt"
"github.com/gin-gonic/gin"
"github.com/go-redis/redis/v7"
"github.com/mailru/easyjson"
. "github.com/simon987/imhashdb"
"log"
"os"
"runtime/pprof"
"time"
)
@ -112,28 +108,15 @@ func hash(c *gin.Context) {
c.Data(200, gin.MIMEJSON, b)
}
func main() {
Init()
f, err := os.Create("prof")
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
go func() {
time.Sleep(time.Second * 15)
pprof.StopCPUProfile()
fmt.Println("!!!!!!!!!!!!!!!")
f.Close()
}()
func Main() error {
r := gin.Default()
r.Use(CORSMiddleware())
r.POST("/api/hash", hash)
r.POST("/api/query", query)
//TODO: concurrency
go queryWorker()
for i := 0; i < Conf.QueryConcurrency; i++ {
go queryWorker()
}
r.Run()
return r.Run(Conf.ApiAddr)
}

View File

@ -1,4 +1,4 @@
package main
package api
import (
"context"
@ -17,7 +17,7 @@ const CacheLength = time.Second * 30
func queryWorker() {
Logger.Info("Query worker started")
for {
value := Rdb.BZPopMin(time.Second * 30, inQueue).Val()
value := Rdb.BZPopMin(time.Second*30, inQueue).Val()
if value == nil {
continue
}
@ -38,13 +38,13 @@ func queryWorker() {
Logger.Info("worker query done")
b = resp
}
Rdb.Set(outQueue + member, b, CacheLength)
Rdb.Set(outQueue+member, b, CacheLength)
}
}
func dbQuery(req QueryReq, value string) ([]byte, error) {
Rdb.SAdd(wipQueue, value)
Rdb.Expire(wipQueue, time.Minute * 10)
Rdb.Expire(wipQueue, time.Minute*10)
defer Rdb.SRem(wipQueue, value)