mirror of
https://github.com/simon987/sist2.git
synced 2025-04-08 13:06:47 +00:00
Better support for media files inside archives
This commit is contained in:
parent
dd2397ef5c
commit
c4fbae123e
@ -91,13 +91,11 @@ they were directly in the file system. Recursive (archives inside archives)
|
||||
scan is also supported.
|
||||
|
||||
**Limitations**:
|
||||
* Parsing media files with formats that require
|
||||
*seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) is not supported.
|
||||
* Support for parsing media files with formats that require *seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.)
|
||||
is limited (see `--mem-buffer` option)
|
||||
* Archive files are scanned sequentially, by a single thread. On systems where
|
||||
**sist2** is not I/O bound, scans might be faster when larger archives are split
|
||||
into smaller parts.
|
||||
|
||||
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
|
||||
|
||||
|
||||
### OCR
|
||||
|
@ -40,6 +40,9 @@ Scan options
|
||||
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
|
||||
-e, --exclude=<str> Files that match this regex will not be scanned
|
||||
--fast Only index file names & mime type
|
||||
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
|
||||
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
|
||||
|
||||
|
||||
Index options
|
||||
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
|
||||
@ -101,6 +104,11 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
|
||||
|
||||
In effect, smaller `treemap-threshold` values will yield a more detailed
|
||||
(but also a more cluttered and harder to read) visualization.
|
||||
|
||||
* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files
|
||||
larger than this number will be read sequentially and no *seek* operations will be supported.
|
||||
|
||||
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
|
||||
|
||||
### Scan examples
|
||||
|
||||
|
@ -14,6 +14,8 @@
|
||||
#define DEFAULT_LISTEN_ADDRESS "localhost:4090"
|
||||
#define DEFAULT_TREEMAP_THRESHOLD 0.0005
|
||||
|
||||
#define DEFAULT_MAX_MEM_BUFFER 2000
|
||||
|
||||
const char* TESS_DATAPATHS[] = {
|
||||
"/usr/share/tessdata/",
|
||||
"/usr/share/tesseract-ocr/tessdata/",
|
||||
@ -187,6 +189,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
args->treemap_threshold = atof(args->treemap_threshold_str);
|
||||
}
|
||||
|
||||
if (args->max_memory_buffer == 0) {
|
||||
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
|
||||
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
|
||||
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
|
||||
@ -203,6 +209,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
|
||||
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
|
||||
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
|
||||
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ typedef struct scan_args {
|
||||
int fast;
|
||||
const char* treemap_threshold_str;
|
||||
double treemap_threshold;
|
||||
int max_memory_buffer;
|
||||
} scan_args_t;
|
||||
|
||||
scan_args_t *scan_args_create();
|
||||
|
@ -22,7 +22,7 @@
|
||||
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
|
||||
|
||||
|
||||
static const char *const Version = "2.3.1";
|
||||
static const char *const Version = "2.3.2";
|
||||
static const char *const usage[] = {
|
||||
"sist2 scan [OPTION]... PATH",
|
||||
"sist2 index [OPTION]... INDEX",
|
||||
@ -127,6 +127,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.media_ctx.log = _log;
|
||||
ScanCtx.media_ctx.logf = _logf;
|
||||
ScanCtx.media_ctx.store = _store;
|
||||
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
|
||||
init_media();
|
||||
|
||||
// OOXML
|
||||
@ -357,7 +358,10 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
|
||||
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
|
||||
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
|
||||
"(see USAGE.md). DEFAULT: 0.0005"),
|
||||
"(see USAGE.md). DEFAULT: 0.0005"),
|
||||
OPT_INTEGER(0, "mem-buffer", &scan_args->max_memory_buffer,
|
||||
"Maximum memory buffer size per thread in MB for files inside archives "
|
||||
"(see USAGE.md). DEFAULT: 2000"),
|
||||
|
||||
OPT_GROUP("Index options"),
|
||||
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
|
||||
|
@ -77,6 +77,7 @@ function shouldPlayVideo(hit) {
|
||||
|
||||
return mime &&
|
||||
mime.startsWith("video/") &&
|
||||
!("parent" in hit["_source"]) &&
|
||||
hit["_source"]["extension"] !== "mkv" &&
|
||||
hit["_source"]["extension"] !== "avi" &&
|
||||
videoc !== "hevc" &&
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
<nav class="navbar navbar-expand-lg">
|
||||
<a class="navbar-brand" href="/">sist2</a>
|
||||
<span class="badge badge-pill version">2.3.1</span>
|
||||
<span class="badge badge-pill version">2.3.2</span>
|
||||
<span class="tagline">Lightning-fast file system indexer and search tool </span>
|
||||
<a class="btn ml-auto" href="/stats">Stats</a>
|
||||
<button class="btn" type="button" data-toggle="modal" data-target="#settings" onclick="loadSettings()">Settings</button>
|
||||
|
@ -171,7 +171,7 @@ int merge_up(double thresh) {
|
||||
|
||||
int size = g_hash_table_size(FlatTree);
|
||||
|
||||
LOG_DEBUGF("stats.h", "Merge up iteration (%d merged, %d in tree)", count, size)
|
||||
LOG_DEBUGF("stats.c", "Merge up iteration (%d merged, %d in tree)", count, size)
|
||||
return count;
|
||||
}
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
2
third-party/libscan
vendored
2
third-party/libscan
vendored
@ -1 +1 @@
|
||||
Subproject commit fe6232ed8222ee33178d6c50ff2c5ea6312e7dee
|
||||
Subproject commit be2dabe4668b4d0b8197d73a239cb1b4035a9c56
|
Loading…
x
Reference in New Issue
Block a user