Un-break raw file thumbnails

Merge pull request #262 from yatli/fix_261
fix #261: inherit index id from base index when using incremental scan
2025-10-19 18:46:52 +00:00 · 2022-02-26 20:37:45 -05:00 · 2022-02-26 11:37:16 -05:00 · 2022-02-27 00:25:23 +08:00 · 2022-02-20 10:52:22 -05:00 · 2022-02-20 10:43:34 -05:00
12 changed files with 283 additions and 86 deletions
--- a/2
+++ b/2
@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>
 WORKDIR /build/
 COPY . .
-RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
+RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
 RUN make -j$(nproc)
 RUN strip sist2 || mv sist2_debug sist2
--- a/README.md
+++ b/README.md
@ -52,7 +52,7 @@ sist2 (Simple incremental search tool)
 Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
    2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
       recommended!)*
-    3. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
+    3. *(or)* `docker pull simon987/sist2:2.11.7-x64-linux`
 1. See [Usage guide](docs/USAGE.md)
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@ -13,7 +13,6 @@
    * [options](#web-options)
    * [examples](#web-examples)
    * [rewrite_url](#rewrite_url)
    * [link to specific indices](#link-to-specific-indices)
 * [elasticsearch](#elasticsearch)
 * [exec-script](#exec-script)
 * [tagging](#tagging)
@ -26,62 +25,66 @@ Usage: sist2 scan [OPTION]... PATH
   or: sist2 exec-script [OPTION]... INDEX
 Lightning-fast file system indexer and search tool.
-    -h, --help                    show this help message and exit
+    -h, --help                        show this help message and exit
-    -v, --version                 Show version and exit
+    -v, --version                     Show version and exit
-    --verbose                     Turn on logging
+    --verbose                         Turn on logging
-    --very-verbose                Turn on debug messages
+    --very-verbose                    Turn on debug messages
 Scan options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
+    -t, --threads=<int>               Number of threads. DEFAULT=1
-    -q, --quality=<flt>           Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
+    --mem-throttle=<int>              Total memory threshold in MiB for scan throttling. DEFAULT=0
-    --size=<int>                  Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
+    -q, --thumbnail-quality=<flt>     Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=1
-    --content-size=<int>          Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
+    --thumbnail-size=<int>            Thumbnail size, in pixels. DEFAULT=500
-    --incremental=<str>           Reuse an existing index and only scan modified files.
+    --thumbnail-count=<int>           Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
-    -o, --output=<str>            Output directory. DEFAULT=index.sist2/
+    --content-size=<int>              Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
-    --rewrite-url=<str>           Serve files from this url instead of from disk.
+    --incremental=<str>               Reuse an existing index and only scan modified files.
-    --name=<str>                  Index display name. DEFAULT: (name of the directory)
+    -o, --output=<str>                Output directory. DEFAULT=index.sist2/
-    --depth=<int>                 Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
+    --rewrite-url=<str>               Serve files from this url instead of from disk.
-    --archive=<str>               Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
+    --name=<str>                      Index display name. DEFAULT: (name of the directory)
-    --archive-passphrase=<str>    Passphrase for encrypted archive files
+    --depth=<int>                     Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
-    --ocr-lang=<str>              Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
+    --archive=<str>                   Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
-    --ocr-images                  Enable OCR'ing of image files.
+    --archive-passphrase=<str>        Passphrase for encrypted archive files
-    --ocr-ebooks                  Enable OCR'ing of ebook files.
+    --ocr-lang=<str>                  Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
-    -e, --exclude=<str>           Files that match this regex will not be scanned
+    --ocr-images                      Enable OCR'ing of image files.
-    --fast                        Only index file names & mime type
+    --ocr-ebooks                      Enable OCR'ing of ebook files.
-    --treemap-threshold=<str>     Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
+    -e, --exclude=<str>               Files that match this regex will not be scanned
-    --mem-buffer=<int>            Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
+    --fast                            Only index file names & mime type
-    --read-subtitles              Read subtitles from media files.
+    --treemap-threshold=<str>         Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
-    --fast-epub                   Faster but less accurate EPUB parsing (no thumbnails, metadata)
+    --mem-buffer=<int>                Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
-    --checksums                   Calculate file checksums when scanning.
+    --read-subtitles                  Read subtitles from media files.
-    --list-file=<str>             Specify a list of newline-delimited paths to be scanned instead of normal directory traversal. Use '-' to read from stdin.
+    --fast-epub                       Faster but less accurate EPUB parsing (no thumbnails, metadata)
    --checksums                       Calculate file checksums when scanning.
    --list-file=<str>                 Specify a list of newline-delimited paths to be scanned instead of normal directory traversal. Use '-' to read from stdin.
 Index options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
+    -t, --threads=<int>               Number of threads. DEFAULT=1
-    --es-url=<str>                Elasticsearch url with port. DEFAULT=http://localhost:9200
+    --es-url=<str>                    Elasticsearch url with port. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
+    --es-index=<str>                  Elasticsearch index name. DEFAULT=sist2
-    -p, --print                   Just print JSON documents to stdout.
+    -p, --print                       Just print JSON documents to stdout.
-    --script-file=<str>           Path to user script.
+    --incremental-index               Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
-    --mappings-file=<str>         Path to Elasticsearch mappings.
+    --script-file=<str>               Path to user script.
-    --settings-file=<str>         Path to Elasticsearch settings.
+    --mappings-file=<str>             Path to Elasticsearch mappings.
-    --async-script                Execute user script asynchronously.
+    --settings-file=<str>             Path to Elasticsearch settings.
-    --batch-size=<int>            Index batch size. DEFAULT: 100
+    --async-script                    Execute user script asynchronously.
-    -f, --force-reset             Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
+    --batch-size=<int>                Index batch size. DEFAULT: 100
    -f, --force-reset                 Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
 Web options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
+    --es-url=<str>                    Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
+    --es-index=<str>                  Elasticsearch index name. DEFAULT=sist2
-    --bind=<str>                  Listen on this address. DEFAULT=localhost:4090
+    --bind=<str>                      Listen on this address. DEFAULT=localhost:4090
-    --auth=<str>                  Basic auth in user:password format
+    --auth=<str>                      Basic auth in user:password format
-    --tag-auth=<str>              Basic auth in user:password format for tagging
+    --tag-auth=<str>                  Basic auth in user:password format for tagging
-    --tagline=<str>               Tagline in navbar
+    --tagline=<str>                   Tagline in navbar
-    --dev                         Serve html & js files from disk (for development)
+    --dev                             Serve html & js files from disk (for development)
-    --lang=<str>                  Default UI language. Can be changed by the user
+    --lang=<str>                      Default UI language. Can be changed by the user
 Exec-script options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
+    --es-url=<str>                    Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
+    --es-index=<str>                  Elasticsearch index name. DEFAULT=sist2
-    --script-file=<str>           Path to user script.
+    --script-file=<str>               Path to user script.
-    --async-script                Execute user script asynchronously.
+    --async-script                    Execute user script asynchronously.
 Made by simon987 <me@simon987.net>. Released under GPL-3.0
 ```
 ## Scan
@ -90,13 +93,21 @@ Exec-script options
 * `-t, --threads` 
      Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
-* `-q, --quality` 
+* `--mem-throttle`
    Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
    until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
 * `-q, --thumbnail-quality` 
    Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
-* `--size` 
+* `--thumbnail-size` 
    Thumbnail size in pixels.
 * `--thumbnail-count`
    Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
    will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image 
    every ~5s). Set to 0 to completely disable thumbnails.
 * `--content-size` 
-    Number of bytes of text to be extracted from the content of files (plain text and PDFs).
+    Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
    Repeated whitespace and special characters do not count toward this limit.
    Set to 0 to completely disable content parsing.
 * `--incremental`
    Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
    will be copied to the new index and will not be parsed again.
@ -129,13 +140,13 @@ Exec-script options
    In effect, smaller `treemap-threshold` values will yield a more detailed 
    (but also a more cluttered and harder to read) visualization. 
-* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files 
+* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files 
    larger than this number will be read sequentially and no *seek* operations will be supported.
    To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
 * `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
 * `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
-* `--checksums` Calculate file checksums (sha1) when scanning files. This option does not cause any additional read 
+* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read 
  operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
  files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
@ -205,6 +216,9 @@ and values are raw image bytes.
    Elasticsearch index name. DEFAULT=sist2
 * `-p, --print` 
    Print index in JSON format to stdout.
 * `--incremental-index`
   Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
   Only the new changes since the last scan will be sent.
 * `--script-file` 
    Path to user script. See [Scripting](scripting.md).
 * `--mappings-file`
--- a/sist2-vue/dist/css/chunk-vendors.css
+++ b/sist2-vue/dist/css/chunk-vendors.css
--- a/sist2-vue/dist/css/index.css
+++ b/sist2-vue/dist/css/index.css
--- a/sist2-vue/dist/index.html
+++ b/sist2-vue/dist/index.html
@ -0,0 +1,3 @@
 <!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"><title>sist2</title><link href="css/chunk-vendors.css" rel="preload" as="style"><link href="css/index.css" rel="preload" as="style"><link href="js/chunk-vendors.js" rel="preload" as="script"><link href="js/index.js" rel="preload" as="script"><link href="css/chunk-vendors.css" rel="stylesheet"><link href="css/index.css" rel="stylesheet"></head><body><noscript><style>body {
            height: initial;
        }</style><div style="text-align: center; margin-top: 100px"><strong>We're sorry but sist2 doesn't work properly without JavaScript enabled. Please enable it to continue.</strong><br><strong>Nous sommes désolés mais sist2 ne fonctionne pas correctement si JavaScript est désactivé. Veuillez l'activer pour continuer.</strong></div></noscript><div id="app"></div><script src="js/chunk-vendors.js"></script><script src="js/index.js"></script></body></html>
--- a/sist2-vue/dist/js/chunk-vendors.js
+++ b/sist2-vue/dist/js/chunk-vendors.js
--- a/sist2-vue/dist/js/index.js
+++ b/sist2-vue/dist/js/index.js
--- a/src/main.c
+++ b/src/main.c
@ -103,7 +103,7 @@ void sig_handler(int signum) {
    exit(-1);
 }
-void init_dir(const char *dirpath) {
+void init_dir(const char *dirpath, scan_args_t* args) {
    char path[PATH_MAX];
    snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
@ -111,9 +111,18 @@ void init_dir(const char *dirpath) {
    strcpy(ScanCtx.index.desc.version, Version);
    strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
-    unsigned char index_md5[MD5_DIGEST_LENGTH];
+    if (args->incremental != NULL) {
-    MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
+      // copy old index id
-    buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
+      char descriptor_path[PATH_MAX];
      snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
      index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
      memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
    } else {
      // generate new index id based on timestamp
      unsigned char index_md5[MD5_DIGEST_LENGTH];
      MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
      buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
    }
    write_index_descriptor(path, &ScanCtx.index.desc);
 }
@ -378,7 +387,7 @@ void sist2_scan(scan_args_t *args) {
    initialize_scan_context(args);
-    init_dir(ScanCtx.index.path);
+    init_dir(ScanCtx.index.path, args);
    char store_path[PATH_MAX];
    snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
@ -674,7 +683,7 @@ int main(int argc, const char *argv[]) {
            OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
            OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
            OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
-                        "Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch."),
+                        "Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch."),
            OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
            OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
            OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
--- a/third-party/libscan/CMakeLists.txt
+++ b/third-party/libscan/CMakeLists.txt
@ -11,11 +11,6 @@ if (SIST_DEBUG)
            antiword
            DEBUG
    )
 else()
    add_compile_definitions(
            antiword
            NDEBUG
    )
    target_compile_options(
            antiword
            PRIVATE
@ -25,6 +20,11 @@ else()
            -fsanitize=address
            -fno-inline
    )
 else()
    add_compile_definitions(
            antiword
            NDEBUG
    )
 endif()
 add_library(
--- a/third-party/libscan/libscan/media/media.c
+++ b/third-party/libscan/libscan/media/media.c
@ -251,7 +251,7 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic
    for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
 __always_inline
-static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
+static void append_audio_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
    AVDictionaryEntry *tag = NULL;
    while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
@ -269,7 +269,7 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
        } else if (strcmp(key, "album") == 0) {
            APPEND_TAG_META(MetaAlbum)
        } else if (strcmp(key, "comment") == 0) {
-            APPEND_TAG_META(MetaContent)
+            append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
        }
    }
 }
@ -437,7 +437,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
        return SAVE_THUMBNAIL_FAILED;
    }
-    if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx)) {
+    if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx) && thumbnail_index == 0) {
        ocr_image(ctx, doc, decoder, frame_and_packet->frame);
    }
@ -558,7 +558,7 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
    }
    if (audio_stream != -1) {
-        append_audio_meta(pFormatCtx, doc);
+        append_audio_meta(ctx, pFormatCtx, doc);
    }
    if (video_stream != -1 && ctx->tn_count > 0) {
--- a/third-party/libscan/libscan/raw/raw.c
+++ b/third-party/libscan/libscan/raw/raw.c
@ -7,8 +7,22 @@
 #define MIN_SIZE 32
-int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
+int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_thumbnail_t img, document_t *doc) {
-    return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
+
    scan_media_ctx_t media_ctx = {
            .read_subtitles = FALSE,
            .tn_count = 1,
            .max_media_buffer = 0,
            .store = ctx->store,
            .log = ctx->log,
            .logf = ctx->logf,
            .tn_size = ctx->tn_size,
            .tn_qscale = ctx->tn_qscale,
            .tesseract_lang = NULL,
            .tesseract_path = NULL
    };
    return store_image_thumbnail(&media_ctx, img.thumb, img.tlength, doc, "x.jpeg");
 }
 int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
@ -171,25 +185,25 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
        return;
    }
    int errc = 0;
    libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
    if (errc != 0) {
        free(buf);
        libraw_dcraw_clear_mem(thumb);
        libraw_close(libraw_lib);
        return;
    }
    int tn_ok = 0;
    if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
-        tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
+        tn_ok = store_thumbnail_jpeg(ctx, libraw_lib->thumbnail, doc);
    } else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
        // TODO: technically this should work but is currently untested
        int errc = 0;
        libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
        if (errc != 0) {
            free(buf);
            libraw_dcraw_clear_mem(thumb);
            libraw_close(libraw_lib);
            return;
        }
        tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
    }
    libraw_dcraw_clear_mem(thumb);
    if (tn_ok == TRUE) {
        free(buf);
        libraw_close(libraw_lib);
@ -206,7 +220,7 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
    libraw_dcraw_process(libraw_lib);
-    errc = 0;
+    int errc = 0;
    libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
    if (errc != 0) {
        free(buf);
Author	SHA1	Message	Date
simon987	57a28d781f	Un-break raw file thumbnails	2022-02-26 20:37:45 -05:00
simon987	6ec98046fa	Merge pull request #262 from yatli/fix_261 fix #261: inherit index id from base index when using incremental scan	2022-02-26 11:37:16 -05:00
Yatao Li	4fac81ca6a	fix #261 : new index ids generated for incremental scan	2022-02-27 00:25:23 +08:00
simon987	2882741926	Fix multiple content metadata bug (but without compilation error this time)	2022-02-20 10:52:22 -05:00
simon987	edba9b7917	Fix multiple content metadata bug	2022-02-20 10:43:34 -05:00
simon987	e89964d592	Fix antiword build	2022-02-20 09:37:24 -05:00
simon987	329afcbe4f	Update docs & UI stuff	2022-02-20 09:13:19 -05:00
simon987	2a2664a5cd	Disable debug in docker image oops	2022-02-20 09:01:17 -05:00
simon987	0d18637e88	Merge pull request #257 from simon987/dev v2.11.7	2022-02-20 08:34:26 -05:00