Compare commits

...

5 Commits

Author SHA1 Message Date
c5cd00b76c
Update USAGE.md 2021-09-05 20:26:09 -04:00
ec5f07cab8
Merge pull request #184 from simon987/dev
v2.11.1
2021-09-05 20:06:18 -04:00
f098f7916a Version bump 2021-09-05 20:05:46 -04:00
85d67a9393 null checks in sig_handler 2021-09-05 20:03:42 -04:00
c5ac89813f Fix UI bug when losing focus of tags/mime tree 2021-09-05 19:59:01 -04:00
8 changed files with 48 additions and 106 deletions

View File

@ -32,7 +32,7 @@ Lightning-fast file system indexer and search tool.
Scan options Scan options
-t, --threads=<int> Number of threads. DEFAULT=1 -t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=5 -q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500 --size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768 --content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files. --incremental=<str> Reuse an existing index and only scan modified files.
@ -41,12 +41,14 @@ Scan options
--name=<str> Index display name. DEFAULT: (name of the directory) --name=<str> Index display name. DEFAULT: (name of the directory)
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1 --depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse --archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine) --ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
-e, --exclude=<str> Files that match this regex will not be scanned -e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type --fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005 --treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000 --mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files --read-subtitles Read subtitles from media files.
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
Index options Index options
-t, --threads=<int> Number of threads. DEFAULT=1 -t, --threads=<int> Number of threads. DEFAULT=1
@ -66,13 +68,14 @@ Web options
--bind=<str> Listen on this address. DEFAULT=localhost:4090 --bind=<str> Listen on this address. DEFAULT=localhost:4090
--auth=<str> Basic auth in user:password format --auth=<str> Basic auth in user:password format
--tag-auth=<str> Basic auth in user:password format for tagging --tag-auth=<str> Basic auth in user:password format for tagging
--tagline=<str> Tagline in navbar
--dev Serve html & js files from disk (for development)
Exec-script options Exec-script options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200 --es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2 --es-index=<str> Elasticsearch index name. DEFAULT=sist2
--script-file=<str> Path to user script. --script-file=<str> Path to user script.
--async-script Execute user script asynchronously. --async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
``` ```
## Scan ## Scan
@ -82,7 +85,7 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
* `-t, --threads` * `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!** Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality` * `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. *Does not affect PDF thumbnails quality* Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size` * `--size`
Thumbnail size in pixels. Thumbnail size in pixels.
* `--content-size` * `--content-size`
@ -125,6 +128,7 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -` To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files. * `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
### Scan examples ### Scan examples
@ -145,15 +149,11 @@ sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
### Index format ### Index format
A typical `binary` type index structure looks like this: A typical `ndjson` type index structure looks like this:
``` ```
documents.idx/ documents.idx/
├── descriptor.json ├── descriptor.json
├── _index_139965416830720 ├── _index_main.ndjson.zst
├── _index_139965425223424
├── _index_139965433616128
├── _index_139965442008832
├── _index_139965442008832
├── treemap.csv ├── treemap.csv
├── agg_mime.csv ├── agg_mime.csv
├── agg_date.csv ├── agg_date.csv
@ -169,9 +169,7 @@ documents.idx/
└── lock.mdb └── lock.mdb
``` ```
The `_index_*` files contain the raw binary index data and are not meant to be The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed newline-delimited file.
read by other applications. The format is generally compatible across different
sist2 versions.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails. database containing the thumbnails.
@ -181,66 +179,6 @@ following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rew
The `.csv` are pre-computed aggregations necessary for the stats page. The `.csv` are pre-computed aggregations necessary for the stats page.
*Advanced usage*
Instead of using the `scan` module, you can also import an index generated
by a third party application. The 'external' index must have the following format:
```
my_index/
├── descriptor.json
├── _index_0
└── thumbs/
| ├── data.mdb
| └── lock.mdb
└── meta/
└── <empty>
```
*descriptor.json*:
```json
{
"uuid": "<valid UUID4>",
"version": "_external_v1",
"root": "(optional)",
"name": "<name>",
"rewrite_url": "(optional)",
"type": "json",
"timestamp": 1578971024
}
```
*_index_0*: NDJSON format (One json object per line)
```json
{
"_id": "unique uuid for the file",
"index": "index uuid4 (same one as descriptor.json!)",
"mime": "application/x-cbz",
"size": 14341204,
"mtime": 1578882996,
"extension": "cbz",
"name": "my_book",
"path": "path/to/books",
"content": "text contents of the book",
"title": "Title of the book",
"tag": ["genre.fiction", "author.someguy", "etc..."],
"_keyword": [
{"k": "ISBN", "v": "ABCD34789231"}
],
"_text": [
{"k": "other", "v": "This will be indexed as text"}
]
}
```
You can find the full list of supported fields [here](../src/io/serialize.c#L90)
The `_keyword.*` items will be indexed and searchable as **keyword** fields (only full matches allowed).
The `_text.*` items will be indexed and searchable as **text** fields (fuzzy searching allowed)
*thumbs/*: *thumbs/*:
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field) LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
@ -248,9 +186,6 @@ and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root* *\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guaranties of back/forward compatibility.
## Index ## Index
### Index options ### Index options
@ -276,6 +211,7 @@ it is currently unsupported and has no guaranties of back/forward compatibility.
down the process. down the process.
* `-f, --force-reset` * `-f, --force-reset`
Reset Elasticsearch mappings and settings. Reset Elasticsearch mappings and settings.
* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.
### Index examples ### Index examples
@ -305,6 +241,8 @@ sist2 index --print ./my_index/ | jq | less
* `--auth=<str>` Basic auth in user:password format * `--auth=<str>` Basic auth in user:password format
* `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the * `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the
`--auth` argument, but authentication is only applied to the `/tag/` endpoint. `--auth` argument, but authentication is only applied to the `/tag/` endpoint.
* `--tagline=<str>` When specified, will replace the default tagline in the navbar.
* `--dev` Serve html & js files from disk (for development, used to modify frontend files without having to recompile)
### Web examples ### Web examples
@ -327,12 +265,6 @@ instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file. `descriptor.json` file.
### Link to specific indices
To link to specific indices, you can add a list of comma-separated index names to
the URL: `?i=<name>,<name>`. By default, indices with `"(nsfw)"` in their name are
not displayed.
## exec-script ## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state. The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -43,7 +43,7 @@ export default {
}, },
methods: { methods: {
handleTreeClick(node, e) { handleTreeClick(node, e) {
if (e === "indeterminate" || e === "collapsed") { if (e === "indeterminate" || e === "collapsed" || e === 'rendered' || e === "focused") {
return; return;
} }

View File

@ -162,7 +162,7 @@ export default {
}); });
}, },
handleTreeClick(node, e) { handleTreeClick(node, e) {
if (e === "indeterminate" || e === "collapsed" || e === 'rendered') { if (e === "indeterminate" || e === "collapsed" || e === 'rendered' || e === "focused") {
return; return;
} }

View File

@ -2,6 +2,8 @@
ScanCtx_t ScanCtx = { ScanCtx_t ScanCtx = {
.stat_index_size = 0, .stat_index_size = 0,
.dbg_current_files = NULL,
.pool = NULL
}; };
WebCtx_t WebCtx; WebCtx_t WebCtx;
IndexCtx_t IndexCtx; IndexCtx_t IndexCtx;

View File

@ -43,6 +43,7 @@ void sig_handler(int signum) {
LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n"); LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n");
LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum)); LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum));
if (ScanCtx.dbg_current_files != NULL) {
GHashTableIter iter; GHashTableIter iter;
g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files); g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files);
@ -65,8 +66,15 @@ void sig_handler(int signum) {
); );
} }
} }
}
if (ScanCtx.pool != NULL) {
tpool_dump_debug_info(ScanCtx.pool); tpool_dump_debug_info(ScanCtx.pool);
}
if (IndexCtx.pool != NULL) {
tpool_dump_debug_info(IndexCtx.pool);
}
LOG_INFO( LOG_INFO(
"*SIGNAL HANDLER*", "*SIGNAL HANDLER*",

View File

@ -49,7 +49,7 @@
#include <ctype.h> #include <ctype.h>
#include "git_hash.h" #include "git_hash.h"
#define VERSION "2.11.0" #define VERSION "2.11.1"
static const char *const Version = VERSION; static const char *const Version = VERSION;
#ifndef SIST_PLATFORM #ifndef SIST_PLATFORM