Compare commits

...

9 Commits

Author SHA1 Message Date
57a28d781f Un-break raw file thumbnails 2022-02-26 20:37:45 -05:00
6ec98046fa
Merge pull request #262 from yatli/fix_261
fix #261: inherit index id from base index when using incremental scan
2022-02-26 11:37:16 -05:00
Yatao Li
4fac81ca6a fix #261: new index ids generated for incremental scan 2022-02-27 00:25:23 +08:00
2882741926 Fix multiple content metadata bug (but without compilation error this time) 2022-02-20 10:52:22 -05:00
edba9b7917 Fix multiple content metadata bug 2022-02-20 10:43:34 -05:00
e89964d592 Fix antiword build 2022-02-20 09:37:24 -05:00
329afcbe4f Update docs & UI stuff 2022-02-20 09:13:19 -05:00
2a2664a5cd Disable debug in docker image oops 2022-02-20 09:01:17 -05:00
0d18637e88
Merge pull request #257 from simon987/dev
v2.11.7
2022-02-20 08:34:26 -05:00
12 changed files with 283 additions and 86 deletions

View File

@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>
WORKDIR /build/
COPY . .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2 || mv sist2_debug sist2

View File

@ -52,7 +52,7 @@ sist2 (Simple incremental search tool)
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
3. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
3. *(or)* `docker pull simon987/sist2:2.11.7-x64-linux`
1. See [Usage guide](docs/USAGE.md)

View File

@ -13,7 +13,6 @@
* [options](#web-options)
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
@ -33,9 +32,11 @@ Lightning-fast file system indexer and search tool.
Scan options
-t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--mem-throttle=<int> Total memory threshold in MiB for scan throttling. DEFAULT=0
-q, --thumbnail-quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=1
--thumbnail-size=<int> Thumbnail size, in pixels. DEFAULT=500
--thumbnail-count=<int> Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
--content-size=<int> Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files.
-o, --output=<str> Output directory. DEFAULT=index.sist2/
--rewrite-url=<str> Serve files from this url instead of from disk.
@ -49,7 +50,7 @@ Scan options
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
--mem-buffer=<int> Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files.
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
--checksums Calculate file checksums when scanning.
@ -60,6 +61,7 @@ Index options
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
-p, --print Just print JSON documents to stdout.
--incremental-index Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch.
--script-file=<str> Path to user script.
--mappings-file=<str> Path to Elasticsearch mappings.
--settings-file=<str> Path to Elasticsearch settings.
@ -82,6 +84,7 @@ Exec-script options
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
--script-file=<str> Path to user script.
--async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
```
## Scan
@ -90,13 +93,21 @@ Exec-script options
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality`
* `--mem-throttle`
Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
* `-q, --thumbnail-quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size`
* `--thumbnail-size`
Thumbnail size in pixels.
* `--thumbnail-count`
Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image
every ~5s). Set to 0 to completely disable thumbnails.
* `--content-size`
Number of bytes of text to be extracted from the content of files (plain text and PDFs).
Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
Repeated whitespace and special characters do not count toward this limit.
Set to 0 to completely disable content parsing.
* `--incremental`
Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
will be copied to the new index and will not be parsed again.
@ -129,13 +140,13 @@ Exec-script options
In effect, smaller `treemap-threshold` values will yield a more detailed
(but also a more cluttered and harder to read) visualization.
* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files
* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files
larger than this number will be read sequentially and no *seek* operations will be supported.
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
* `--checksums` Calculate file checksums (sha1) when scanning files. This option does not cause any additional read
* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read
operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
@ -205,6 +216,9 @@ and values are raw image bytes.
Elasticsearch index name. DEFAULT=sist2
* `-p, --print`
Print index in JSON format to stdout.
* `--incremental-index`
Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
Only the new changes since the last scan will be sent.
* `--script-file`
Path to user script. See [Scripting](scripting.md).
* `--mappings-file`

9
sist2-vue/dist/css/chunk-vendors.css vendored Normal file

File diff suppressed because one or more lines are too long

1
sist2-vue/dist/css/index.css vendored Normal file

File diff suppressed because one or more lines are too long

3
sist2-vue/dist/index.html vendored Normal file
View File

@ -0,0 +1,3 @@
<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"><title>sist2</title><link href="css/chunk-vendors.css" rel="preload" as="style"><link href="css/index.css" rel="preload" as="style"><link href="js/chunk-vendors.js" rel="preload" as="script"><link href="js/index.js" rel="preload" as="script"><link href="css/chunk-vendors.css" rel="stylesheet"><link href="css/index.css" rel="stylesheet"></head><body><noscript><style>body {
height: initial;
}</style><div style="text-align: center; margin-top: 100px"><strong>We're sorry but sist2 doesn't work properly without JavaScript enabled. Please enable it to continue.</strong><br><strong>Nous sommes désolés mais sist2 ne fonctionne pas correctement si JavaScript est activé. Veuillez l'activer pour continuer.</strong></div></noscript><div id="app"></div><script src="js/chunk-vendors.js"></script><script src="js/index.js"></script></body></html>

146
sist2-vue/dist/js/chunk-vendors.js vendored Normal file

File diff suppressed because one or more lines are too long

1
sist2-vue/dist/js/index.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -103,7 +103,7 @@ void sig_handler(int signum) {
exit(-1);
}
void init_dir(const char *dirpath) {
void init_dir(const char *dirpath, scan_args_t* args) {
char path[PATH_MAX];
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
@ -111,9 +111,18 @@ void init_dir(const char *dirpath) {
strcpy(ScanCtx.index.desc.version, Version);
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
if (args->incremental != NULL) {
// copy old index id
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
} else {
// genreate new index id based on timestamp
unsigned char index_md5[MD5_DIGEST_LENGTH];
MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
}
write_index_descriptor(path, &ScanCtx.index.desc);
}
@ -378,7 +387,7 @@ void sist2_scan(scan_args_t *args) {
initialize_scan_context(args);
init_dir(ScanCtx.index.path);
init_dir(ScanCtx.index.path, args);
char store_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
@ -674,7 +683,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
"Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch."),
"Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch."),
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),

View File

@ -11,11 +11,6 @@ if (SIST_DEBUG)
antiword
DEBUG
)
else()
add_compile_definitions(
antiword
NDEBUG
)
target_compile_options(
antiword
PRIVATE
@ -25,6 +20,11 @@ else()
-fsanitize=address
-fno-inline
)
else()
add_compile_definitions(
antiword
NDEBUG
)
endif()
add_library(

View File

@ -251,7 +251,7 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
static void append_audio_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
AVDictionaryEntry *tag = NULL;
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
@ -269,7 +269,7 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
} else if (strcmp(key, "album") == 0) {
APPEND_TAG_META(MetaAlbum)
} else if (strcmp(key, "comment") == 0) {
APPEND_TAG_META(MetaContent)
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
}
}
}
@ -437,7 +437,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
return SAVE_THUMBNAIL_FAILED;
}
if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx)) {
if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx) && thumbnail_index == 0) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
}
@ -558,7 +558,7 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
append_audio_meta(ctx, pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_count > 0) {

View File

@ -7,8 +7,22 @@
#define MIN_SIZE 32
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_thumbnail_t img, document_t *doc) {
scan_media_ctx_t media_ctx = {
.read_subtitles = FALSE,
.tn_count = 1,
.max_media_buffer = 0,
.store = ctx->store,
.log = ctx->log,
.logf = ctx->logf,
.tn_size = ctx->tn_size,
.tn_qscale = ctx->tn_qscale,
.tesseract_lang = NULL,
.tesseract_path = NULL
};
return store_image_thumbnail(&media_ctx, img.thumb, img.tlength, doc, "x.jpeg");
}
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
@ -171,6 +185,13 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
return;
}
int tn_ok = 0;
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
tn_ok = store_thumbnail_jpeg(ctx, libraw_lib->thumbnail, doc);
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
// TODO: technically this should work but is currently untested
int errc = 0;
libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
if (errc != 0) {
@ -180,16 +201,9 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
return;
}
int tn_ok = 0;
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
// TODO: technically this should work but is currently untested
tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
}
libraw_dcraw_clear_mem(thumb);
if (tn_ok == TRUE) {
free(buf);
libraw_close(libraw_lib);
@ -206,7 +220,7 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
libraw_dcraw_process(libraw_lib);
errc = 0;
int errc = 0;
libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
if (errc != 0) {
free(buf);