From c4fbae123ea51aa107bd6fe5dfe78e406a50fd8b Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 24 May 2020 14:10:23 -0400 Subject: [PATCH] Better support for media files inside archives --- README.md | 6 ++---- docs/USAGE.md | 8 ++++++++ src/cli.c | 7 +++++++ src/cli.h | 1 + src/main.c | 8 ++++++-- src/static/js/dom.js | 1 + src/static/search.html | 2 +- src/stats.c | 2 +- src/web/static_generated.c | 4 ++-- third-party/libscan | 2 +- 10 files changed, 30 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9feeae2..2d4265b 100644 --- a/README.md +++ b/README.md @@ -91,13 +91,11 @@ they were directly in the file system. Recursive (archives inside archives) scan is also supported. **Limitations**: -* Parsing media files with formats that require -*seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) is not supported. +* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) + is limited (see `--mem-buffer` option) * Archive files are scanned sequentially, by a single thread. On systems where **sist2** is not I/O bound, scans might be faster when larger archives are split into smaller parts. - -To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -` ### OCR diff --git a/docs/USAGE.md b/docs/USAGE.md index 3715583..e0118b7 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -40,6 +40,9 @@ Scan options --ocr= Tesseract language (use tesseract --list-langs to see which are installed on your machine) -e, --exclude= Files that match this regex will not be scanned --fast Only index file names & mime type + --treemap-threshold= Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005 + --mem-buffer= Maximum memory buffer size in MB for files inside archives (see USAGE.md). DEFAULT: 2000 + Index options --es-url= Elasticsearch url with port. DEFAULT=http://localhost:9200 @@ -101,6 +104,11 @@ Made by simon987 . 
Released under GPL-3.0 In effect, smaller `treemap-threshold` values will yield a more detailed (but also a more cluttered and harder to read) visualization. + +* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files + larger than this number will be read sequentially and no *seek* operations will be supported. + + To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -` ### Scan examples diff --git a/src/cli.c b/src/cli.c index aa738fc..d0c390f 100644 --- a/src/cli.c +++ b/src/cli.c @@ -14,6 +14,8 @@ #define DEFAULT_LISTEN_ADDRESS "localhost:4090" #define DEFAULT_TREEMAP_THRESHOLD 0.0005 +#define DEFAULT_MAX_MEM_BUFFER 2000 + const char* TESS_DATAPATHS[] = { "/usr/share/tessdata/", "/usr/share/tesseract-ocr/tessdata/", @@ -187,6 +189,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { args->treemap_threshold = atof(args->treemap_threshold_str); } + if (args->max_memory_buffer == 0) { + args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER; + } + LOG_DEBUGF("cli.c", "arg quality=%f", args->quality) LOG_DEBUGF("cli.c", "arg size=%d", args->size) LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size) @@ -203,6 +209,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex) LOG_DEBUGF("cli.c", "arg fast=%d", args->fast) LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold) + LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer) return 0; } diff --git a/src/cli.h b/src/cli.h index d0c81a5..94541ee 100644 --- a/src/cli.h +++ b/src/cli.h @@ -24,6 +24,7 @@ typedef struct scan_args { int fast; const char* treemap_threshold_str; double treemap_threshold; + int max_memory_buffer; } scan_args_t; scan_args_t *scan_args_create(); diff --git a/src/main.c b/src/main.c index 5c7710e..377cf3a 100644 --- a/src/main.c +++ b/src/main.c @@ -22,7 +22,7 @@ #define 
EPILOG "Made by simon987 . Released under GPL-3.0" -static const char *const Version = "2.3.1"; +static const char *const Version = "2.3.2"; static const char *const usage[] = { "sist2 scan [OPTION]... PATH", "sist2 index [OPTION]... INDEX", @@ -127,6 +127,7 @@ void initialize_scan_context(scan_args_t *args) { ScanCtx.media_ctx.log = _log; ScanCtx.media_ctx.logf = _logf; ScanCtx.media_ctx.store = _store; + ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024; init_media(); // OOXML @@ -357,7 +358,10 @@ int main(int argc, const char *argv[]) { OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"), OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"), OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap " - "(see USAGE.md). DEFAULT: 0.0005"), + "(see USAGE.md). DEFAULT: 0.0005"), + OPT_INTEGER(0, "mem-buffer", &scan_args->max_memory_buffer, + "Maximum memory buffer size per thread in MB for files inside archives " + "(see USAGE.md). DEFAULT: 2000"), OPT_GROUP("Index options"), OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"), diff --git a/src/static/js/dom.js b/src/static/js/dom.js index 6e03666..dac18e3 100644 --- a/src/static/js/dom.js +++ b/src/static/js/dom.js @@ -77,6 +77,7 @@ function shouldPlayVideo(hit) { return mime && mime.startsWith("video/") && + !("parent" in hit["_source"]) && hit["_source"]["extension"] !== "mkv" && hit["_source"]["extension"] !== "avi" && videoc !== "hevc" && diff --git a/src/static/search.html b/src/static/search.html index 69d52d1..553ce7f 100644 --- a/src/static/search.html +++ b/src/static/search.html @@ -11,7 +11,7 @@