diff --git a/README.md b/README.md index 3b7785c..1b487fe 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ sist2 (Simple incremental search tool) \* See [format support](#format-support) \*\* See [Archive files](#archive-files) -\*\*\* See [OCR](#ocr) +\*\*\* See [OCR](#ocr) \*\*\*\* See [Named-Entity Recognition](#NER) ## Getting Started @@ -46,7 +46,7 @@ services: - "discovery.type=single-node" - "ES_JAVA_OPTS=-Xms2g -Xmx2g" sist2-admin: - image: simon987/sist2:3.0.7-x64-linux + image: simon987/sist2:3.1.0-x64-linux restart: unless-stopped volumes: - ./sist2-admin-data/:/sist2-admin/ @@ -206,7 +206,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux 3. Install vcpkg dependencies ```bash - vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample] + vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp] ``` 4. Build diff --git a/docs/USAGE.md b/docs/USAGE.md index 43be817..6483d5a 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -17,7 +17,7 @@ Lightning-fast file system indexer and search tool. Scan options -t, --threads= Number of threads. DEFAULT: 1 - -q, --thumbnail-quality= Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2 + -q, --thumbnail-quality= Thumbnail quality, on a scale of 0 to 100, 100 being the best. DEFAULT: 50 --thumbnail-size= Thumbnail size, in pixels. DEFAULT: 552 --thumbnail-count= Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT: 1 --content-size= Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT: 32768 @@ -88,8 +88,8 @@ Made by simon987 . Released under GPL-3.0 See chart below for rough estimate of thumbnail size vs. thumbnail size & quality arguments: -For example, `--thumbnail-size=500`, `--thumbnail-quality=2` for a directory with 8 million images will create a thumbnail database -that is about `8000000 * 36kB = 288GB`. +For example, `--thumbnail-size=500`, `--thumbnail-quality=50` for a directory with 8 million images will create a thumbnail database +that is about `8000000 * 11.8kB = 94.4GB`. ![thumbnail_size](thumbnail_size.png) diff --git a/docs/thumbnail_size.png b/docs/thumbnail_size.png index 3550334..90d88a0 100644 Binary files a/docs/thumbnail_size.png and b/docs/thumbnail_size.png differ diff --git a/src/cli.c b/src/cli.c index 1acf394..f40ad16 100644 --- a/src/cli.c +++ b/src/cli.c @@ -5,7 +5,7 @@ #define DEFAULT_OUTPUT "index.sist2" #define DEFAULT_NAME "index" #define DEFAULT_CONTENT_SIZE 32768 -#define DEFAULT_QUALITY 2 +#define DEFAULT_QUALITY 50 #define DEFAULT_THUMBNAIL_SIZE 552 #define DEFAULT_THUMBNAIL_COUNT 1 #define DEFAULT_REWRITE_URL "" @@ -100,8 +100,8 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) { args->tn_quality = DEFAULT_QUALITY; - } else if (args->tn_quality < 2 || args->tn_quality > 31) { - fprintf(stderr, "Invalid value for --thumbnail-quality argument: %d. Must be within [2, 31].\n", + } else if (args->tn_quality < 0 || args->tn_quality > 100) { + fprintf(stderr, "Invalid value for --thumbnail-quality argument: %d. Must be within [0, 100].\n", args->tn_quality); return 1; } @@ -109,7 +109,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { if (args->tn_size == OPTION_VALUE_UNSPECIFIED) { args->tn_size = DEFAULT_THUMBNAIL_SIZE; } else if (args->tn_size < 32) { - printf("Invalid value --thumbnail-size argument: %d. Must be greater than 32 pixels.\n", args->tn_size); + printf("Invalid value --thumbnail-size argument: %d. Must be >= 32 pixels.\n", args->tn_size); return 1; } diff --git a/src/main.c b/src/main.c index 07ec787..b85631c 100644 --- a/src/main.c +++ b/src/main.c @@ -490,7 +490,7 @@ int main(int argc, const char *argv[]) { OPT_GROUP("Scan options"), OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT: 1"), OPT_INTEGER('q', "thumbnail-quality", &scan_args->tn_quality, - "Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2", + "Thumbnail quality, on a scale of 0 to 100, 100 being the best. DEFAULT: 50", set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality), OPT_INTEGER(0, "thumbnail-size", &scan_args->tn_size, "Thumbnail size, in pixels. DEFAULT: 552", diff --git a/src/sist.h b/src/sist.h index 861bf09..5657c85 100644 --- a/src/sist.h +++ b/src/sist.h @@ -51,11 +51,11 @@ #include #include "git_hash.h" -#define VERSION "3.0.7" +#define VERSION "3.1.0" static const char *const Version = VERSION; static const int VersionMajor = 3; -static const int VersionMinor = 0; -static const int VersionPatch = 7; +static const int VersionMinor = 1; +static const int VersionPatch = 0; #ifndef SIST_PLATFORM #define SIST_PLATFORM unknown diff --git a/third-party/libscan/libscan/ebook/ebook.c b/third-party/libscan/libscan/ebook/ebook.c index b53d937..0c56397 100644 --- a/third-party/libscan/libscan/ebook/ebook.c +++ b/third-party/libscan/libscan/ebook/ebook.c @@ -153,22 +153,23 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d sws_freeContext(sws_ctx); - // YUV420p -> JPEG - AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale); - avcodec_send_frame(jpeg_encoder, scaled_frame); + // YUV420p -> JPEG/WEBP + AVCodecContext *thumbnail_encoder = alloc_webp_encoder(pixmap->w, pixmap->h, ctx->tn_qscale); + avcodec_send_frame(thumbnail_encoder, scaled_frame); + avcodec_send_frame(thumbnail_encoder, NULL); // Send EOF - AVPacket jpeg_packet; - av_init_packet(&jpeg_packet); - avcodec_receive_packet(jpeg_encoder, &jpeg_packet); + AVPacket thumbnail_packet; + av_init_packet(&thumbnail_packet); + avcodec_receive_packet(thumbnail_encoder, &thumbnail_packet); APPEND_LONG_META(doc, MetaThumbnail, 1); - ctx->store(doc->doc_id, 0, (char *) jpeg_packet.data, jpeg_packet.size); + ctx->store(doc->doc_id, 0, (char *) thumbnail_packet.data, thumbnail_packet.size); free(samples); - av_packet_unref(&jpeg_packet); + av_packet_unref(&thumbnail_packet); av_free(*scaled_frame->data); av_frame_free(&scaled_frame); - avcodec_free_context(&jpeg_encoder); + avcodec_free_context(&thumbnail_encoder); fz_drop_pixmap(fzctx, pixmap); fz_drop_page(fzctx, cover); diff --git a/third-party/libscan/libscan/media/media.c b/third-party/libscan/libscan/media/media.c index fa62724..bbc7684 100644 --- a/third-party/libscan/libscan/media/media.c +++ b/third-party/libscan/libscan/media/media.c @@ -68,7 +68,7 @@ void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) struct SwsContext *sws_ctx = sws_getContext( decoder->width, decoder->height, decoder->pix_fmt, - dstW, dstH, AV_PIX_FMT_YUVJ420P, + dstW, dstH, AV_PIX_FMT_YUV420P, SIST_SWS_ALGO, 0, 0, 0 ); @@ -436,7 +436,8 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor } if (seek_ok == FALSE && thumbnail_index != 0) { - CTX_LOG_WARNING(doc->filepath, "(media.c) Could not seek media file. Can't generate additional thumbnails."); + CTX_LOG_WARNING(doc->filepath, + "(media.c) Could not seek media file. Can't generate additional thumbnails."); return SAVE_THUMBNAIL_FAILED; } } @@ -470,18 +471,19 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size); } else { - // Encode frame to jpeg - AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, - ctx->tn_qscale); - avcodec_send_frame(jpeg_encoder, scaled_frame); + // Encode frame + AVCodecContext *thumbnail_encoder = alloc_webp_encoder(scaled_frame->width, scaled_frame->height, + ctx->tn_qscale); + avcodec_send_frame(thumbnail_encoder, scaled_frame); + avcodec_send_frame(thumbnail_encoder, NULL); // send EOF - AVPacket jpeg_packet; - av_init_packet(&jpeg_packet); - avcodec_receive_packet(jpeg_encoder, &jpeg_packet); + AVPacket thumbnail_packet; + av_init_packet(&thumbnail_packet); + avcodec_receive_packet(thumbnail_encoder, &thumbnail_packet); // Save thumbnail if (thumbnail_index == 0) { - ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size); + ctx->store(doc->doc_id, 0, thumbnail_packet.data, thumbnail_packet.size); return_value = SAVE_THUMBNAIL_OK; } else if (thumbnail_index > 1) { @@ -489,15 +491,15 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor // I figure out a better fix. thumbnail_index -= 1; - ctx->store(doc->doc_id, thumbnail_index, jpeg_packet.data, jpeg_packet.size); + ctx->store(doc->doc_id, thumbnail_index, thumbnail_packet.data, thumbnail_packet.size); return_value = SAVE_THUMBNAIL_OK; } else { return_value = SAVE_THUMBNAIL_SKIPPED; } - avcodec_free_context(&jpeg_encoder); - av_packet_unref(&jpeg_packet); + avcodec_free_context(&thumbnail_encoder); + av_packet_unref(&thumbnail_packet); av_free(*scaled_frame->data); av_frame_free(&scaled_frame); } @@ -854,9 +856,10 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size); } else { // Encode frame to jpeg - AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, + AVCodecContext *jpeg_encoder = alloc_webp_encoder(scaled_frame->width, scaled_frame->height, ctx->tn_qscale); avcodec_send_frame(jpeg_encoder, scaled_frame); + avcodec_send_frame(jpeg_encoder, NULL); // Send EOF AVPacket jpeg_packet; av_init_packet(&jpeg_packet); diff --git a/third-party/libscan/libscan/media/media.h b/third-party/libscan/libscan/media/media.h index b1e6b48..ebe70a0 100644 --- a/third-party/libscan/libscan/media/media.h +++ b/third-party/libscan/libscan/media/media.h @@ -48,6 +48,28 @@ static AVCodecContext *alloc_jpeg_encoder(int w, int h, int qscale) { return jpeg; } +static AVCodecContext *alloc_webp_encoder(int w, int h, int qscale) { + + const AVCodec *webp_codec = avcodec_find_encoder(AV_CODEC_ID_WEBP); + AVCodecContext *webp = avcodec_alloc_context3(webp_codec); + webp->width = w; + webp->height = h; + webp->time_base.den = 1000000; + webp->time_base.num = 1; + webp->compression_level = 6; + webp->global_quality = FF_QP2LAMBDA * qscale; + + webp->pix_fmt = AV_PIX_FMT_YUV420P; + webp->color_range = AVCOL_RANGE_JPEG; + int ret = avcodec_open2(webp, webp_codec, NULL); + + if (ret != 0) { + return NULL; + } + + return webp; +} + void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str); diff --git a/third-party/libscan/libscan/raw/raw.c b/third-party/libscan/libscan/raw/raw.c index 7991d7c..047550f 100644 --- a/third-party/libscan/libscan/raw/raw.c +++ b/third-party/libscan/libscan/raw/raw.c @@ -52,7 +52,7 @@ int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, do struct SwsContext *sws_ctx = sws_getContext( img->width, img->height, AV_PIX_FMT_RGB24, - dstW, dstH, AV_PIX_FMT_YUVJ420P, + dstW, dstH, AV_PIX_FMT_YUV420P, SIST_SWS_ALGO, 0, 0, 0 ); @@ -76,20 +76,21 @@ int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, do sws_freeContext(sws_ctx); - AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f); - avcodec_send_frame(jpeg_encoder, scaled_frame); + AVCodecContext *thumbnail_encoder = alloc_webp_encoder(scaled_frame->width, scaled_frame->height, ctx->tn_qscale); + avcodec_send_frame(thumbnail_encoder, scaled_frame); + avcodec_send_frame(thumbnail_encoder, NULL); // Send EOF - AVPacket jpeg_packet; - av_init_packet(&jpeg_packet); - avcodec_receive_packet(jpeg_encoder, &jpeg_packet); + AVPacket thumbnail_packet; + av_init_packet(&thumbnail_packet); + avcodec_receive_packet(thumbnail_encoder, &thumbnail_packet); APPEND_LONG_META(doc, MetaThumbnail, 1); - ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size); + ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) thumbnail_packet.data, thumbnail_packet.size); - av_packet_unref(&jpeg_packet); + av_packet_unref(&thumbnail_packet); av_free(*scaled_frame->data); av_frame_free(&scaled_frame); - avcodec_free_context(&jpeg_encoder); + avcodec_free_context(&thumbnail_encoder); return TRUE; }