mirror of
https://github.com/simon987/sist2.git
synced 2025-04-21 19:26:45 +00:00
Compare commits
9 Commits
8ad9fc9e32
...
57a28d781f
Author | SHA1 | Date | |
---|---|---|---|
57a28d781f | |||
6ec98046fa | |||
|
4fac81ca6a | ||
2882741926 | |||
edba9b7917 | |||
e89964d592 | |||
329afcbe4f | |||
2a2664a5cd | |||
0d18637e88 |
@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>
|
||||
|
||||
WORKDIR /build/
|
||||
COPY . .
|
||||
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
|
||||
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
|
||||
RUN make -j$(nproc)
|
||||
RUN strip sist2 || mv sist2_debug sist2
|
||||
|
||||
|
@ -52,7 +52,7 @@ sist2 (Simple incremental search tool)
|
||||
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
|
||||
2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
|
||||
recommended!)*
|
||||
3. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
|
||||
3. *(or)* `docker pull simon987/sist2:2.11.7-x64-linux`
|
||||
|
||||
1. See [Usage guide](docs/USAGE.md)
|
||||
|
||||
|
@ -13,7 +13,6 @@
|
||||
* [options](#web-options)
|
||||
* [examples](#web-examples)
|
||||
* [rewrite_url](#rewrite_url)
|
||||
* [link to specific indices](#link-to-specific-indices)
|
||||
* [elasticsearch](#elasticsearch)
|
||||
* [exec-script](#exec-script)
|
||||
* [tagging](#tagging)
|
||||
@ -33,9 +32,11 @@ Lightning-fast file system indexer and search tool.
|
||||
|
||||
Scan options
|
||||
-t, --threads=<int> Number of threads. DEFAULT=1
|
||||
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
|
||||
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
|
||||
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
|
||||
--mem-throttle=<int> Total memory threshold in MiB for scan throttling. DEFAULT=0
|
||||
-q, --thumbnail-quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=1
|
||||
--thumbnail-size=<int> Thumbnail size, in pixels. DEFAULT=500
|
||||
--thumbnail-count=<int> Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
|
||||
--content-size=<int> Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
|
||||
--incremental=<str> Reuse an existing index and only scan modified files.
|
||||
-o, --output=<str> Output directory. DEFAULT=index.sist2/
|
||||
--rewrite-url=<str> Serve files from this url instead of from disk.
|
||||
@ -49,7 +50,7 @@ Scan options
|
||||
-e, --exclude=<str> Files that match this regex will not be scanned
|
||||
--fast Only index file names & mime type
|
||||
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
|
||||
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
|
||||
--mem-buffer=<int> Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
|
||||
--read-subtitles Read subtitles from media files.
|
||||
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
|
||||
--checksums Calculate file checksums when scanning.
|
||||
@ -60,6 +61,7 @@ Index options
|
||||
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
|
||||
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
|
||||
-p, --print Just print JSON documents to stdout.
|
||||
--incremental-index Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch.
|
||||
--script-file=<str> Path to user script.
|
||||
--mappings-file=<str> Path to Elasticsearch mappings.
|
||||
--settings-file=<str> Path to Elasticsearch settings.
|
||||
@ -82,6 +84,7 @@ Exec-script options
|
||||
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
|
||||
--script-file=<str> Path to user script.
|
||||
--async-script Execute user script asynchronously.
|
||||
Made by simon987 <me@simon987.net>. Released under GPL-3.0
|
||||
```
|
||||
|
||||
## Scan
|
||||
@ -90,13 +93,21 @@ Exec-script options
|
||||
|
||||
* `-t, --threads`
|
||||
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
|
||||
* `-q, --quality`
|
||||
* `--mem-throttle`
|
||||
Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
|
||||
until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
|
||||
* `-q, --thumbnail-quality`
|
||||
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
|
||||
* `--size`
|
||||
* `--thumbnail-size`
|
||||
Thumbnail size in pixels.
|
||||
* `--thumbnail-count`
|
||||
Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
|
||||
will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image
|
||||
every ~5s). Set to 0 to completely disable thumbnails.
|
||||
* `--content-size`
|
||||
Number of bytes of text to be extracted from the content of files (plain text and PDFs).
|
||||
Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
|
||||
Repeated whitespace and special characters do not count toward this limit.
|
||||
Set to 0 to completely disable content parsing.
|
||||
* `--incremental`
|
||||
Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
|
||||
will be copied to the new index and will not be parsed again.
|
||||
@ -129,13 +140,13 @@ Exec-script options
|
||||
In effect, smaller `treemap-threshold` values will yield a more detailed
|
||||
(but also a more cluttered and harder to read) visualization.
|
||||
|
||||
* `--mem-buffer` Maximum memory buffer size in MB (per thread) for files inside archives. Media files
|
||||
* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files
|
||||
larger than this number will be read sequentially and no *seek* operations will be supported.
|
||||
|
||||
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
|
||||
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
|
||||
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
|
||||
* `--checksums` Calculate file checksums (sha1) when scanning files. This option does not cause any additional read
|
||||
* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read
|
||||
operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
|
||||
files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
|
||||
|
||||
@ -205,6 +216,9 @@ and values are raw image bytes.
|
||||
Elasticsearch index name. DEFAULT=sist2
|
||||
* `-p, --print`
|
||||
Print index in JSON format to stdout.
|
||||
* `--incremental-index`
|
||||
Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
|
||||
Only the new changes since the last scan will be sent.
|
||||
* `--script-file`
|
||||
Path to user script. See [Scripting](scripting.md).
|
||||
* `--mappings-file`
|
||||
|
9
sist2-vue/dist/css/chunk-vendors.css
vendored
Normal file
9
sist2-vue/dist/css/chunk-vendors.css
vendored
Normal file
File diff suppressed because one or more lines are too long
1
sist2-vue/dist/css/index.css
vendored
Normal file
1
sist2-vue/dist/css/index.css
vendored
Normal file
File diff suppressed because one or more lines are too long
3
sist2-vue/dist/index.html
vendored
Normal file
3
sist2-vue/dist/index.html
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"><title>sist2</title><link href="css/chunk-vendors.css" rel="preload" as="style"><link href="css/index.css" rel="preload" as="style"><link href="js/chunk-vendors.js" rel="preload" as="script"><link href="js/index.js" rel="preload" as="script"><link href="css/chunk-vendors.css" rel="stylesheet"><link href="css/index.css" rel="stylesheet"></head><body><noscript><style>body {
|
||||
height: initial;
|
||||
}</style><div style="text-align: center; margin-top: 100px"><strong>We're sorry but sist2 doesn't work properly without JavaScript enabled. Please enable it to continue.</strong><br><strong>Nous sommes désolés mais sist2 ne fonctionne pas correctement si JavaScript est activé. Veuillez l'activer pour continuer.</strong></div></noscript><div id="app"></div><script src="js/chunk-vendors.js"></script><script src="js/index.js"></script></body></html>
|
146
sist2-vue/dist/js/chunk-vendors.js
vendored
Normal file
146
sist2-vue/dist/js/chunk-vendors.js
vendored
Normal file
File diff suppressed because one or more lines are too long
1
sist2-vue/dist/js/index.js
vendored
Normal file
1
sist2-vue/dist/js/index.js
vendored
Normal file
File diff suppressed because one or more lines are too long
15
src/main.c
15
src/main.c
@ -103,7 +103,7 @@ void sig_handler(int signum) {
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
void init_dir(const char *dirpath) {
|
||||
void init_dir(const char *dirpath, scan_args_t* args) {
|
||||
char path[PATH_MAX];
|
||||
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
|
||||
|
||||
@ -111,9 +111,18 @@ void init_dir(const char *dirpath) {
|
||||
strcpy(ScanCtx.index.desc.version, Version);
|
||||
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
|
||||
|
||||
if (args->incremental != NULL) {
|
||||
// copy old index id
|
||||
char descriptor_path[PATH_MAX];
|
||||
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
|
||||
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
|
||||
memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
|
||||
} else {
|
||||
// genreate new index id based on timestamp
|
||||
unsigned char index_md5[MD5_DIGEST_LENGTH];
|
||||
MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
|
||||
buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
|
||||
}
|
||||
|
||||
write_index_descriptor(path, &ScanCtx.index.desc);
|
||||
}
|
||||
@ -378,7 +387,7 @@ void sist2_scan(scan_args_t *args) {
|
||||
|
||||
initialize_scan_context(args);
|
||||
|
||||
init_dir(ScanCtx.index.path);
|
||||
init_dir(ScanCtx.index.path, args);
|
||||
|
||||
char store_path[PATH_MAX];
|
||||
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
|
||||
@ -674,7 +683,7 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
|
||||
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
|
||||
OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
|
||||
"Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch."),
|
||||
"Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch."),
|
||||
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
|
||||
OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
|
||||
OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
|
||||
|
10
third-party/libscan/CMakeLists.txt
vendored
10
third-party/libscan/CMakeLists.txt
vendored
@ -11,11 +11,6 @@ if (SIST_DEBUG)
|
||||
antiword
|
||||
DEBUG
|
||||
)
|
||||
else()
|
||||
add_compile_definitions(
|
||||
antiword
|
||||
NDEBUG
|
||||
)
|
||||
target_compile_options(
|
||||
antiword
|
||||
PRIVATE
|
||||
@ -25,6 +20,11 @@ else()
|
||||
-fsanitize=address
|
||||
-fno-inline
|
||||
)
|
||||
else()
|
||||
add_compile_definitions(
|
||||
antiword
|
||||
NDEBUG
|
||||
)
|
||||
endif()
|
||||
|
||||
add_library(
|
||||
|
8
third-party/libscan/libscan/media/media.c
vendored
8
third-party/libscan/libscan/media/media.c
vendored
@ -251,7 +251,7 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic
|
||||
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
|
||||
|
||||
__always_inline
|
||||
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
static void append_audio_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
|
||||
AVDictionaryEntry *tag = NULL;
|
||||
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
|
||||
@ -269,7 +269,7 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
} else if (strcmp(key, "album") == 0) {
|
||||
APPEND_TAG_META(MetaAlbum)
|
||||
} else if (strcmp(key, "comment") == 0) {
|
||||
APPEND_TAG_META(MetaContent)
|
||||
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -437,7 +437,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
return SAVE_THUMBNAIL_FAILED;
|
||||
}
|
||||
|
||||
if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx)) {
|
||||
if (ctx->tesseract_lang != NULL && IS_VIDEO(pFormatCtx) && thumbnail_index == 0) {
|
||||
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
|
||||
}
|
||||
|
||||
@ -558,7 +558,7 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
|
||||
}
|
||||
|
||||
if (audio_stream != -1) {
|
||||
append_audio_meta(pFormatCtx, doc);
|
||||
append_audio_meta(ctx, pFormatCtx, doc);
|
||||
}
|
||||
|
||||
if (video_stream != -1 && ctx->tn_count > 0) {
|
||||
|
34
third-party/libscan/libscan/raw/raw.c
vendored
34
third-party/libscan/libscan/raw/raw.c
vendored
@ -7,8 +7,22 @@
|
||||
|
||||
#define MIN_SIZE 32
|
||||
|
||||
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
|
||||
return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
|
||||
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_thumbnail_t img, document_t *doc) {
|
||||
|
||||
scan_media_ctx_t media_ctx = {
|
||||
.read_subtitles = FALSE,
|
||||
.tn_count = 1,
|
||||
.max_media_buffer = 0,
|
||||
.store = ctx->store,
|
||||
.log = ctx->log,
|
||||
.logf = ctx->logf,
|
||||
.tn_size = ctx->tn_size,
|
||||
.tn_qscale = ctx->tn_qscale,
|
||||
.tesseract_lang = NULL,
|
||||
.tesseract_path = NULL
|
||||
};
|
||||
|
||||
return store_image_thumbnail(&media_ctx, img.thumb, img.tlength, doc, "x.jpeg");
|
||||
}
|
||||
|
||||
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
|
||||
@ -171,6 +185,13 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tn_ok = 0;
|
||||
|
||||
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
|
||||
tn_ok = store_thumbnail_jpeg(ctx, libraw_lib->thumbnail, doc);
|
||||
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
|
||||
// TODO: technically this should work but is currently untested
|
||||
|
||||
int errc = 0;
|
||||
libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
|
||||
if (errc != 0) {
|
||||
@ -180,16 +201,9 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tn_ok = 0;
|
||||
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
|
||||
tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
|
||||
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
|
||||
// TODO: technically this should work but is currently untested
|
||||
tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
|
||||
}
|
||||
|
||||
libraw_dcraw_clear_mem(thumb);
|
||||
|
||||
if (tn_ok == TRUE) {
|
||||
free(buf);
|
||||
libraw_close(libraw_lib);
|
||||
@ -206,7 +220,7 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
libraw_dcraw_process(libraw_lib);
|
||||
|
||||
errc = 0;
|
||||
int errc = 0;
|
||||
libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
|
||||
if (errc != 0) {
|
||||
free(buf);
|
||||
|
Loading…
x
Reference in New Issue
Block a user