Add a Dockerfile, list the runtime requirements in the README, and add an
optional on-disk image store to the hasher (--store / IMHASHDB_STORE).

Review notes on this revision of the patch:

* hasher: StoreData and DataPath are now assigned at the top of Main instead
  of in package-level initialisers. Package-level initialisers run before
  main() parses the CLI flags, so Conf.Store was always empty at that point
  and images were never stored. This assumes Main is invoked after the flags
  have been parsed -- confirm against the command's Action in cli/main.go.
* hasher: storeData builds its paths with filepath.Join; the "fmt" import it
  needed before is no longer added, since nothing else in this patch uses it.
* hasher: the queue prefix is stripped with strings.TrimPrefix rather than by
  slicing at len(Pattern)-1, which only worked for a single trailing "*".
* Dockerfile: "apt-get update" and "apt-get install" share one layer, ENV uses
  the key=value form, and the file ends with a newline.
* Whitespace inside the Go hunks was reconstructed following gofmt; apply with
  --ignore-whitespace if the context does not match. The index lines are
  carried over unchanged and do not describe the edited post-images.

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9d26332
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM ubuntu
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y git build-essential libopencv-dev wget libssl-dev
+
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.16.2/cmake-3.16.2.tar.gz && \
+    tar -xzf cmake-*.tar.gz && cd cmake-* && ./bootstrap && make -j 4 && make install
+
+RUN wget http://fftw.org/fftw-3.3.8.tar.gz && tar -xzf fftw-3.3.8.tar.gz && cd fftw-3.3.8 && ./configure --enable-shared --disable-static --enable-threads --with-combined-threads --enable-portable-binary CFLAGS='-fPIC' && make -j 4 && make install
+
+RUN wget https://dl.google.com/go/go1.14.2.linux-amd64.tar.gz && tar -C /usr/local -xzf go1.14.2.linux-amd64.tar.gz
+
+WORKDIR /build/
+
+RUN git clone --recursive https://github.com/simon987/fastimagehash
+
+WORKDIR /build/fastimagehash
+
+RUN cmake .
+RUN make -j 4 && make install
+
+WORKDIR /build/
+
+COPY . /build/imhashdb
+
+WORKDIR /build/imhashdb/cli
+
+RUN PATH=$PATH:/usr/local/go/bin go build .
+
+ENV LD_LIBRARY_PATH=/usr/local/lib/
+
+ENTRYPOINT ["/build/imhashdb/cli/cli"]
diff --git a/README.md b/README.md
index 93e5f66..a306c83 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
 # imhashdb
 
-wip
+### Requirements
+1. PostgreSQL
+1. Redis
+1. [pg_hamming](https://github.com/simon987/pg_hamming)
+1. [fastimagehash](https://github.com/simon987/fastimagehash)
diff --git a/cli/main.go b/cli/main.go
index f66e2ce..38ddbb8 100644
--- a/cli/main.go
+++ b/cli/main.go
@@ -89,6 +89,13 @@ func main() {
 						EnvVars:     []string{"IMHASHDB_QUERY_CONCURRENCY"},
 						Destination: &Conf.QueryConcurrency,
 					},
+					&cli.StringFlag{
+						Name:        "store",
+						Value:       "",
+						Usage:       "If set, store downloaded images there",
+						EnvVars:     []string{"IMHASHDB_STORE"},
+						Destination: &Conf.Store,
+					},
 				},
 			},
 			{
diff --git a/core.go b/core.go
index 4a384d2..88f4b4f 100644
--- a/core.go
+++ b/core.go
@@ -12,7 +12,6 @@ import (
 	"strings"
 )
 
-const RedisPrefix = "q."
 const UserAgent = "imhashdb/v1.0"
 
 var ImageSuffixes = []string{
@@ -37,6 +36,8 @@ type Config struct {
 	HasherConcurrency int
 	QueryConcurrency  int
 
+	Store string
+
 	ImgurClientId string
 	HasherPattern string
 }
diff --git a/hasher/hasher.go b/hasher/hasher.go
index d74e340..b0b7289 100644
--- a/hasher/hasher.go
+++ b/hasher/hasher.go
@@ -4,8 +4,12 @@ import (
 	"crypto/md5"
 	"crypto/sha1"
 	"crypto/sha256"
+	"encoding/hex"
 	"encoding/json"
 	"hash/crc32"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"time"
@@ -51,6 +55,25 @@ func worker(queue chan []string) {
 	}
 }
 
+// storeData writes data below DataPath, sharded by the hex-encoded SHA-1
+// digest: <DataPath>/<hex[0]>/<hex[1:3]>/<hex><extension of link>.
+// It panics when the directory or the file cannot be written.
+func storeData(data []byte, digest [20]byte, link string) {
+	hexDigest := hex.EncodeToString(digest[:])
+
+	dir := filepath.Join(DataPath, hexDigest[:1], hexDigest[1:3])
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		panic(err)
+	}
+	filename := filepath.Join(dir, hexDigest+filepath.Ext(link))
+
+	Logger.Debug("Storing image data to file", zap.String("path", filename))
+
+	if err := ioutil.WriteFile(filename, data, 0666); err != nil {
+		panic(err)
+	}
+}
+
 func computeAndStore(rawTask []string) {
 	var task Task
 	err := json.Unmarshal([]byte(rawTask[1]), &task)
@@ -61,7 +84,7 @@ func computeAndStore(rawTask []string) {
 
 	meta := []Meta{{
 		RetrievedAt: time.Now().Unix(),
-		Id:          rawTask[0][len(RedisPrefix):] + "." + strconv.FormatInt(task.Id, 10),
+		Id:          strings.TrimPrefix(rawTask[0], strings.TrimSuffix(Pattern, "*")) + "." + strconv.FormatInt(task.Id, 10),
 		Meta:        []byte(rawTask[1]),
 	}}
 
@@ -86,11 +109,17 @@ func computeAndStore(rawTask []string) {
 		return
 	}
 
+	sha1sum := sha1.Sum(data)
+
+	if StoreData {
+		storeData(data, sha1sum, link)
+	}
+
 	Store(&Entry{
 		H:      h,
 		Size:   len(data),
 		Sha256: sha256.Sum256(data),
-		Sha1:   sha1.Sum(data),
+		Sha1:   sha1sum,
 		Md5:    md5.Sum(data),
 		Crc32:  crc32.ChecksumIEEE(data),
 		Meta:   meta,
@@ -110,6 +139,16 @@ func trimUrl(link string) string {
 	return link
 }
 
+// StoreData and DataPath mirror Conf.Store. They are assigned in Main
+// instead of being initialised here, because package-level initialisers run
+// before the CLI has parsed its flags, when Conf.Store is still empty.
+var StoreData bool
+var DataPath string
+var Pattern = "imhash.*"
+
 func Main() error {
+	StoreData = Conf.Store != ""
+	DataPath = Conf.Store
+
 	queue := make(chan []string)
 
@@ -117,5 +156,5 @@ func Main() error {
 		go worker(queue)
 	}
 
-	return dispatchFromQueue("q.reddit.*", queue)
+	return dispatchFromQueue(Pattern, queue)
 }