Mirror of https://github.com/simon987/sist2.git (synced 2025-12-13 07:19:06 +00:00)

Compare commits: process-po...3.0.5 (31 commits)
| SHA1 |
|---|
| 2d8685f8f5 |
| c930ef7840 |
| d32bda0d68 |
| 499ed0be79 |
| dc39c0ec4b |
| b5cdd9a5df |
| a8b6886f7b |
| a7e9b6af96 |
| 0710dc6d3d |
| 75b66b5982 |
| 9813646c11 |
| ebc9468251 |
| 7baaca5078 |
| 6c4bdc87cf |
| 1ea78887c3 |
| 886fa720ec |
| d43aac735f |
| faf438a798 |
| 5b3b9911bd |
| 237d55ec9c |
| ced4c7de88 |
| 90ee318981 |
| 785121e46c |
| 585c57a2ad |
| 42abbbce95 |
| e8607df26f |
| f1726ca0a9 |
| 3ef675abcf |
| 81658efb19 |
| 60c77678b4 |
| bf1d2f7d55 |
.gitattributes (vendored, 3 changes)
@@ -1,3 +0,0 @@
-CMakeModules/* linguist-vendored
-**/*_generated.c linguist-vendored
-**/*_generated.h linguist-vendored
CMakeLists.txt

@@ -5,6 +5,7 @@ set(CMAKE_C_STANDARD 11)

 option(SIST_DEBUG "Build a debug executable" on)
 option(SIST_FAST "Enable more optimisation flags" off)
+option(SIST_DEBUG_INFO "Turn on debug information in web interface" on)

 add_compile_definitions(
         "SIST_PLATFORM=${SIST_PLATFORM}"

@@ -14,8 +15,18 @@ if (SIST_DEBUG)
     add_compile_definitions(
             "SIST_DEBUG=${SIST_DEBUG}"
     )
+    set(VCPKG_BUILD_TYPE debug)
+else ()
+    set(VCPKG_BUILD_TYPE release)
 endif ()

+if (SIST_DEBUG_INFO)
+    add_compile_definitions(
+            "SIST_DEBUG_INFO=${SIST_DEBUG_INFO}"
+    )
+endif ()
+
+
 add_subdirectory(third-party/libscan)
 set(ARGPARSE_SHARED off)
 add_subdirectory(third-party/argparse)

@@ -47,7 +58,7 @@ add_executable(sist2

         src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp

-        src/database/database_stats.c src/database/database_stats.h src/database/database_schema.c)
+        src/database/database_stats.c src/database/database_schema.c)

 set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)

 target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
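For reference, a local configure-and-build sketch using the new `SIST_DEBUG_INFO` option added above. This is a sketch only; the `/vcpkg` path and the `x64_linux` platform value are taken from the project's Dockerfile and may differ on your machine:

```bash
# Sketch: configure with the new SIST_DEBUG_INFO flag.
# Assumes vcpkg is checked out at /vcpkg, as in the Dockerfile.
mkdir -p build && cd build
cmake -DSIST_PLATFORM=x64_linux \
      -DSIST_DEBUG_INFO=on \
      -DSIST_DEBUG=off \
      -DBUILD_TESTS=off \
      -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
make -j"$(nproc)"
```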
Dockerfile (30 changes)
@@ -19,13 +19,12 @@ COPY sist2-admin sist2-admin
 RUN cd sist2-vue/ && npm install && npm run build
 RUN cd sist2-admin/frontend/ && npm install && npm run build

-RUN mkdir build && cd build && cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+RUN mkdir build && cd build && cmake -DSIST_PLATFORM=x64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
 RUN cd build && make -j$(nproc)
 RUN strip build/sist2 || mv build/sist2_debug build/sist2

 FROM --platform="linux/amd64" ubuntu@sha256:965fbcae990b0467ed5657caceaec165018ef44a4d2d46c7cdea80a9dff0d1ea

 WORKDIR /root

 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

@@ -37,21 +36,22 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y curl libasan5 li
 RUN mkdir -p /usr/share/tessdata && \
     cd /usr/share/tessdata/ && \
-    curl -o /usr/share/tessdata/hin.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/hin.traineddata &&\
-    curl -o /usr/share/tessdata/jpn.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/jpn.traineddata &&\
-    curl -o /usr/share/tessdata/eng.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/eng.traineddata &&\
-    curl -o /usr/share/tessdata/fra.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/fra.traineddata &&\
-    curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
-    curl -o /usr/share/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
-    curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
-    curl -o /usr/share/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
-    curl -o /usr/share/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
-    curl -o /usr/share/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/hin.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/hin.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/jpn.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/jpn.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/eng.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/fra.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/fra.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata

 # sist2
 COPY --from=build /build/build/sist2 /root/sist2

 # sist2-admin
-COPY sist2-admin/requirements.txt sist2-admin/
-RUN python3 -m pip install --no-cache -r sist2-admin/requirements.txt
-COPY --from=build /build/sist2-admin/ sist2-admin/
 WORKDIR /root/sist2-admin
+COPY sist2-admin/requirements.txt /root/sist2-admin/
+RUN python3 -m pip install --no-cache -r /root/sist2-admin/requirements.txt
+COPY --from=build /build/sist2-admin/ /root/sist2-admin/
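The tessdata block above moves the language files to the path where the distribution's tesseract 4 packages presumably look for them. A hedged check after building the image (the image tag is the placeholder from the README build example):

```bash
# Sketch: list the OCR language files baked into the image.
docker run --rm --entrypoint ls my-sist2-image /usr/share/tesseract-ocr/4.00/tessdata/
```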
@@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>

 WORKDIR /build/
 ADD . /build/
-RUN mkdir build && cd build && cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+RUN mkdir build && cd build && cmake -DSIST_PLATFORM=arm64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
 RUN cd build && make -j$(nproc)
 RUN strip build/sist2 || mv build/sist2_debug build/sist2
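Assuming this second Dockerfile is the arm64 variant (the file name is not shown in this diff view), a cross-build sketch with buildx; the `Dockerfile.arm64` name and image tag are assumptions:

```bash
# Sketch: cross-build the arm64 image; adjust -f to the repo's actual file name.
docker buildx build --platform linux/arm64 -f Dockerfile.arm64 -t my-sist2-arm64 .
```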
README.md (108 changes)
@@ -10,13 +10,13 @@ sist2 (Simple incremental search tool)

 *Warning: sist2 is in early development*

-![sist2.png](docs/sist2.png)
+![sist2.gif](docs/sist2.gif)

 ## Features

 * Fast, low memory usage, multi-threaded
+* Manage & schedule scan jobs with simple web interface (Docker only)
 * Mobile-friendly Web interface
 * Portable (all its features are packaged in a single executable)
 * Extracts text and metadata from common file types \*
 * Generates thumbnails \*
 * Incremental scanning
@@ -24,47 +24,60 @@ sist2 (Simple incremental search tool)
 * Recursive scan inside archive files \*\*
 * OCR support with tesseract \*\*\*
 * Stats page & disk utilisation visualization
+* Named-entity recognition (client-side) \*\*\*\*

 \* See [format support](#format-support)
 \*\* See [Archive files](#archive-files)
 \*\*\* See [OCR](#ocr)
-
-![stats](docs/stats.png)
+\*\*\*\* See [Named-Entity Recognition](#NER)

 ## Getting Started

+### Using Docker Compose *(Windows/Linux/Mac)*
+
+```yaml
+version: "3"
+
+services:
+  elasticsearch:
+    image: elasticsearch:7.17.9
+    restart: unless-stopped
+    environment:
+      - "discovery.type=single-node"
+      - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
+  sist2-admin:
+    image: simon987/sist2:3.0.4-x64-linux
+    restart: unless-stopped
+    volumes:
+      - ./sist2-admin-data/:/sist2-admin/
+      - /:/host
+    ports:
+      - 4090:4090 # sist2
+      - 8080:8080 # sist2-admin
+    working_dir: /root/sist2-admin/
+    entrypoint: python3 /root/sist2-admin/sist2_admin/app.py
+```
+
+Navigate to http://localhost:8080/ to configure sist2-admin.
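To try the new compose file above, save it as `docker-compose.yml` and start the stack (a sketch; the flags are standard Docker Compose):

```bash
docker compose up -d   # or: docker-compose up -d on older installs
```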
 ### Using the executable file *(Linux/WSL only)*

 1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
    1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
-   1. *(or)* Run using docker:
+   2. *(or)* Run using docker:
       ```bash
       docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.17.9
       ```
-   1. *(or)* Run using docker-compose:
-      ```yaml
-      elasticsearch:
-        image: docker.elastic.co/elasticsearch/elasticsearch:7.17.9
-        environment:
-          - discovery.type=single-node
-          - "ES_JAVA_OPTS=-Xms1G -Xmx2G"
-      ```
-1. Download sist2 executable
-   1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
-      Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
-   2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
-      recommended!)*
-   3. *(or)* `docker pull simon987/sist2:2.12.1-x64-linux`
-
-1. See [Usage guide](docs/USAGE.md)
+2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
+   Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
+3. See [usage guide](docs/USAGE.md) for command line usage.

-\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
-
-Example usage:
+## Example usage

 See [Usage guide](docs/USAGE.md) for more details

-1. Scan a directory: `sist2 scan ~/Documents -o ./docs_idx`
-1. Push index to Elasticsearch: `sist2 index ./docs_idx`
-1. Start web interface: `sist2 web ./docs_idx`
+1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
+2. Push index to Elasticsearch: `sist2 index ./documents.sist2`
+3. Start web interface: `sist2 web ./documents.sist2`
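Put together, the three steps above make a minimal end-to-end run. Treat this as a sketch, not canonical; the `--force-reset` note follows the old USAGE.md help text ("You must use this option the first time you use the index command"):

```bash
sist2 scan ~/Documents --output ./documents.sist2
sist2 index --force-reset ./documents.sist2   # --force-reset on first run, per USAGE.md
sist2 web ./documents.sist2                   # listens on localhost:4090 by default
```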
 ## Format support

@@ -82,7 +95,7 @@ See [Usage guide](docs/USAGE.md) for more details
 | tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
 | docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
 | doc (MS Word 97-2003) | antiword | yes | no | author, title |
-| mobi, azw, azw3 | libmobi | yes | no | author, title |
+| mobi, azw, azw3 | libmobi | yes | yes | author, title |
 | wpd (WordPerfect) | libwpd | yes | no | *planned* |
 | json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
@@ -123,20 +136,44 @@ sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
 sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
 ```

+### NER
+
+sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported repository URL to
+**Configuration** > **Machine learning options** > **Model repositories**
+to enable it.
+
+The text processing is done in your browser; no data is sent to any third-party services.
+See [simon987/sist2-ner-models](https://github.com/simon987/sist2-ner-models) for more details.
+
+#### List of available repositories:
+
+| URL | Maintainer | Purpose |
+|-----|------------|---------|
+| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General |
+
+<details>
+  <summary>Screenshot</summary>
+
+  ![ner](docs/ner.png)
+
+</details>
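Before adding a repository in the UI, you can sanity-check that its manifest is reachable; the URL is the one from the table above:

```bash
# Sketch: fetch the model repository manifest that sist2 loads client-side.
curl -s https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json
```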
 ## Build from source

 You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries.

-### With docker (recommended)
+### Using docker

 ```bash
 git clone --recursive https://github.com/simon987/sist2/
 cd sist2
-docker build . -f ./Dockerfile -t my-sist2-image
+docker build . -t my-sist2-image
 # Copy sist2 executable from docker image
 docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
 ```

-### On a linux computer
+### Using a linux computer

 1. Install compile-time dependencies
@@ -144,15 +181,14 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
    ```bash
    apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git nodejs
    ```

-1. Apply vcpkg patches, as per [sist2-build](https://github.com/simon987/sist2-build) Dockerfile
-
-1. Install vcpkg dependencies
+2. Install vcpkg using my fork: https://github.com/simon987/vcpkg
+3. Install vcpkg dependencies

    ```bash
    vcpkg install curl[core,openssl] sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample]
    ```

-1. Build
+4. Build
    ```bash
    git clone --recursive https://github.com/simon987/sist2/
    (cd sist2-vue; npm install; npm run build)
docs/USAGE.md (231 changes)
@@ -1,78 +1,64 @@
 # Usage

 *More examples (specifically with docker/compose) are in progress*

 * [scan](#scan)
     * [options](#scan-options)
     * [examples](#scan-examples)
     * [index format](#index-format)
 * [index](#index)
     * [options](#index-options)
     * [examples](#index-examples)
 * [web](#web)
     * [options](#web-options)
     * [examples](#web-examples)
     * [rewrite_url](#rewrite_url)
 * [elasticsearch](#elasticsearch)
 * [exec-script](#exec-script)
 * [tagging](#tagging)
 * [sidecar files](#sidecar-files)

 ```
 Usage: sist2 scan [OPTION]... PATH
    or: sist2 index [OPTION]... INDEX
    or: sist2 web [OPTION]... INDEX...
    or: sist2 exec-script [OPTION]... INDEX

 Lightning-fast file system indexer and search tool.

     -h, --help                    show this help message and exit
-    -v, --version                 Show version and exit
-    --verbose                     Turn on logging
-    --very-verbose                Turn on debug messages
+    -v, --version                 Print version and exit.
+    --verbose                     Turn on logging.
+    --very-verbose                Turn on debug messages.
+    --json-logs                   Output logs in JSON format.

 Scan options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
-    --mem-throttle=<int>          Total memory threshold in MiB for scan throttling. DEFAULT=0
-    -q, --thumbnail-quality=<int> Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT=2
-    --thumbnail-size=<int>        Thumbnail size, in pixels. DEFAULT=500
-    --thumbnail-count=<int>       Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
-    --content-size=<int>          Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
-    --incremental=<str>           Reuse an existing index and only scan modified files.
-    -o, --output=<str>            Output directory. DEFAULT=index.sist2/
+    -t, --threads=<int>           Number of threads. DEFAULT: 1
+    -q, --thumbnail-quality=<int> Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2
+    --thumbnail-size=<int>        Thumbnail size, in pixels. DEFAULT: 552
+    --thumbnail-count=<int>       Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT: 1
+    --content-size=<int>          Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT: 32768
+    -o, --output=<str>            Output index file path. DEFAULT: index.sist2
+    --incremental                 If the output file path exists, only scan new or modified files.
+    --optimize-index              Defragment index file after scan to reduce its file size.
     --rewrite-url=<str>           Serve files from this url instead of from disk.
-    --name=<str>                  Index display name. DEFAULT: (name of the directory)
+    --name=<str>                  Index display name. DEFAULT: index
     --depth=<int>                 Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
-    --archive=<str>               Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
+    --archive=<str>               Archive file mode (skip|list|shallow|recurse). skip: don't scan, list: only save file names as text, shallow: don't scan archives inside archives. DEFAULT: recurse
     --archive-passphrase=<str>    Passphrase for encrypted archive files
     --ocr-lang=<str>              Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
     --ocr-images                  Enable OCR'ing of image files.
     --ocr-ebooks                  Enable OCR'ing of ebook files.
-    -e, --exclude=<str>           Files that match this regex will not be scanned
-    --fast                        Only index file names & mime type
+    -e, --exclude=<str>           Files that match this regex will not be scanned.
+    --fast                        Only index file names & mime type.
     --treemap-threshold=<str>     Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
     --mem-buffer=<int>            Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
     --read-subtitles              Read subtitles from media files.
-    --fast-epub                   Faster but less accurate EPUB parsing (no thumbnails, metadata)
+    --fast-epub                   Faster but less accurate EPUB parsing (no thumbnails, metadata).
+    --checksums                   Calculate file checksums when scanning.
+    --list-file=<str>             Specify a list of newline-delimited paths to be scanned instead of normal directory traversal. Use '-' to read from stdin.

 Index options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
-    --es-url=<str>                Elasticsearch url with port. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
-    -p, --print                   Just print JSON documents to stdout.
-    --incremental-index           Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch.
+    -t, --threads=<int>           Number of threads. DEFAULT: 1
+    --es-url=<str>                Elasticsearch url with port. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
+    -p, --print                   Print JSON documents to stdout instead of indexing to elasticsearch.
+    --incremental-index           Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
     --script-file=<str>           Path to user script.
     --mappings-file=<str>         Path to Elasticsearch mappings.
     --settings-file=<str>         Path to Elasticsearch settings.
     --async-script                Execute user script asynchronously.
-    --batch-size=<int>            Index batch size. DEFAULT: 100
-    -f, --force-reset             Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
+    --batch-size=<int>            Index batch size. DEFAULT: 70
+    -f, --force-reset             Reset Elasticsearch mappings and settings.

 Web options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
-    --bind=<str>                  Listen on this address. DEFAULT=localhost:4090
+    --es-url=<str>                Elasticsearch url. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
+    --bind=<str>                  Listen for connections on this address. DEFAULT: localhost:4090
     --auth=<str>                  Basic auth in user:password format
     --auth0-audience=<str>        API audience/identifier
     --auth0-domain=<str>          Application domain
@@ -84,77 +70,15 @@ Web options
     --lang=<str>                  Default UI language. Can be changed by the user

 Exec-script options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
+    --es-url=<str>                Elasticsearch url. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
     --script-file=<str>           Path to user script.
     --async-script                Execute user script asynchronously.

 Made by simon987 <me@simon987.net>. Released under GPL-3.0
 ```
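The updated help text above introduces several new flags (`--json-logs`, `--optimize-index`, `--list-file`, `--es-insecure-ssl`). A hedged combined example; the exact pairing of flags is illustrative, not from the docs:

```bash
# Sketch: combine some of the flags added in this release.
sist2 scan ~/Documents -o ./documents.sist2 --optimize-index --json-logs
sist2 index ./documents.sist2 --es-insecure-ssl --es-url https://localhost:9200
```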
 ## Scan

 ### Scan options

-* `-t, --threads`
-  Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
-* `--mem-throttle`
-  Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
-  until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
-* `-q, --thumbnail-quality`
-  Thumbnail quality, on a scale of 2 to 32, 2 being the best. See section below for a rough estimate of thumbnail database size
-* `--thumbnail-size`
-  Thumbnail size in pixels.
-* `--thumbnail-count`
-  Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
-  will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image
-  every ~7s). Set to 0 to completely disable thumbnails.
-* `--content-size`
-  Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
-  Repeated whitespace and special characters do not count toward this limit.
-  Set to 0 to completely disable content parsing.
-* `--incremental`
-  Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
-  will be copied to the new index and will not be parsed again.
-* `-o, --output` Output directory.
-* `--rewrite-url` Set the `rewrite_url` option for the web module (See [rewrite_url](#rewrite_url))
-* `--name` Set the `name` option for the web module
-* `--depth` Maximum scan depth. Set to 0 to only scan files directly in the root directory, set to -1 for infinite depth
-* `--archive` Archive file mode.
-    * skip: Don't parse
-    * list: Only get file names as text
-    * shallow: Don't parse archives inside archives.
-    * recurse: Scan archives recursively (default)
-* `--ocr-lang`, `--ocr-ebooks`, `--ocr-images` See [OCR](../README.md#OCR)
-* `-e, --exclude` Regex pattern to exclude files. A file is excluded if the pattern matches any
-  part of the full absolute path.
-
-  Examples:
-    * `-e ".*\.ttf"`: Ignore ttf files
-    * `-e ".*\.(ttf|rar)"`: Ignore ttf and rar files
-    * `-e "^/mnt/backups/"`: Ignore all files in the `/mnt/backups/` directory
-    * `-e "^/mnt/Data[12]/"`: Ignore all files in the `/mnt/Data1/` and `/mnt/Data2/` directories
-    * `-e "(^/usr/)|(^/var/)|(^/media/DRIVE-A/tmp/)|(^/media/DRIVE-B/Trash/)"`: Exclude the
-      `/usr`, `/var`, `/media/DRIVE-A/tmp`, `/media/DRIVE-B/Trash` directories
-* `--fast` Only index file names and mime type
-* `--treemap-threshold` Directories smaller than (`treemap-threshold` * `<total size of the index>`)
-  will not be considered for the disk utilisation visualization; their size will be added to
-  the parent directory. If the parent directory is still smaller than the threshold, it will also be "merged upwards",
-  and so on.
-
-  In effect, smaller `treemap-threshold` values will yield a more detailed
-  (but also a more cluttered and harder to read) visualization.
-
-* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files
-  larger than this number will be read sequentially and no *seek* operations will be supported.
-
-  To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
-* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
-* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
-* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read
-  operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
-  files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
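Although the long-form explanations above were removed in this commit, the options themselves remain in the CLI help. A sketch combining two of them, built from the exclude-pattern examples in the removed text:

```bash
# Sketch: skip the backups directory and ttf/rar files, and compute SHA1 checksums.
sist2 scan /mnt/Data1 \
    -e "(^/mnt/backups/)|(.*\.(ttf|rar))" \
    --checksums \
    -o ./data1.sist2
```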
 #### Thumbnail database size estimation

 See the chart below for a rough estimate of thumbnail database size vs. the thumbnail size & quality arguments:

@@ -164,8 +88,6 @@ that is about `8000000 * 36kB = 288GB`.

 ![thumbnail_size](docs/thumbnail_size.png)

-// TODO: add note about LMDB page size 4096
-
 ### Scan examples

 Simple scan
@@ -175,82 +97,19 @@ sist2 scan ~/Documents
 sist2 scan \
     --threads 4 --content-size 16000000 --thumbnail-quality 2 --archive shallow \
     --name "My Documents" --rewrite-url "http://nas.domain.local/My Documents/" \
-    ~/Documents -o ./documents.idx/
+    ~/Documents -o ./documents.sist2
 ```

 Incremental scan

+If the index file does not exist, `--incremental` has no effect.
 ```bash
+sist2 scan ~/Documents -o ./documents.sist2
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+# or
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+```
-sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
-```
-### Index format
-
-A typical `ndjson` type index structure looks like this:
-```
-documents.idx/
-├── descriptor.json
-├── _index_main.ndjson.zst
-├── treemap.csv
-├── agg_mime.csv
-├── agg_date.csv
-├── agg_size.csv
-├── thumbs/
-|   ├── data.mdb
-|   └── lock.mdb
-├── tags/
-|   ├── data.mdb
-|   └── lock.mdb
-└── meta/
-    ├── data.mdb
-    └── lock.mdb
-```
-
-The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed newline-delimited file.
-
-The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
-database containing the thumbnails.
-
-The `descriptor.json` file contains general information about the index. The
-following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rewrite_url) and `timestamp`.
-
-The `.csv` files are pre-computed aggregations necessary for the stats page.
-
-*thumbs/*:
-
-LMDB key-value store. Keys are **binary** 16-byte md5 hashes* (`_id` field)
-and values are raw image bytes.
-
-*\* The hash is calculated from the full path of the file, including the extension, relative to the index root*

 ## Index
 ### Index options

-* `--es-url`
-  Elasticsearch url and port. If you are using docker, make sure that both containers are on the
-  same network.
-* `--es-index`
-  Elasticsearch index name. DEFAULT=sist2
-* `-p, --print`
-  Print index in JSON format to stdout.
-* `--incremental-index`
-  Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
-  Only the new changes since the last scan will be sent.
-* `--script-file`
-  Path to user script. See [Scripting](scripting.md).
-* `--mappings-file`
-  Path to custom Elasticsearch mappings. If none is specified, [the bundled mappings](https://github.com/simon987/sist2/tree/master/schema) will be used.
-* `--settings-file`
-  Path to custom Elasticsearch settings. *(See above)*
-* `--async-script`
-  Use the `wait_for_completion=false` elasticsearch option while executing the user script.
-  (See [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html))
-* `--batch-size=<int>`
-  Index batch size. Indexing is generally faster with larger batches, but payloads that
-  are too large will fail and the additional overhead of retrying with smaller sizes may slow
-  down the process.
-* `-f, --force-reset`
-  Reset Elasticsearch mappings and settings.
-* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.

 ### Index examples
@@ -380,8 +239,8 @@ The sidecar file must have exactly the same file path and the `.s2meta` suffix.
 ```

 ```
-sist2 scan ~/Documents -o ./docs.idx
-sist2 index ./docs.idx
+sist2 scan ~/Documents -o ./docs.sist2
+sist2 index ./docs.sist2
 ```

 *NOTE*: It is technically possible to overwrite the `tag` value using sidecar files, however,
BIN docs/ner.png (new file; 448 KiB)
BIN docs/sist2.gif (new file; 3.7 MiB)
BIN docs/sist2.png (deleted; was 1011 KiB)
@@ -4,14 +4,20 @@ VCPKG_ROOT="/vcpkg"

 git submodule update --init --recursive

-rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+mkdir build
+(
+  cd build
+  cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
   ./sist2 -v > VERSION
-mv sist2 sist2-x64-linux
+)
+mv build/sist2 sist2-x64-linux

+(
+  cd build
   rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+  cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
-mv sist2_debug sist2-x64-linux-debug
+)
+mv build/sist2_debug sist2-x64-linux-debug
@@ -4,14 +4,19 @@ VCPKG_ROOT="/vcpkg"

 git submodule update --init --recursive

-rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+mkdir build
+(
+  cd build
+  cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
-mv sist2 sist2-arm64-linux
+)
+mv build/sist2 sist2-arm64-linux

 rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+(
+  cd build
+  cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
-mv sist2_debug sist2-arm64-linux-debug
+)
+mv build/sist2_debug sist2-arm64-linux-debug
@@ -1,3 +1,3 @@
 docker run --rm -it --name "sist2-dev-es"\
     -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" \
-    -e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.1.2
+    -e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.7.0
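A quick way to confirm the dev container above is accepting connections (a standard Elasticsearch endpoint, nothing sist2-specific):

```bash
curl -s "http://localhost:9200/_cluster/health?pretty"
```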
sist2-admin/frontend/package-lock.json (generated, 12 changes)
@@ -10491,9 +10491,9 @@
       "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
     },
     "node_modules/webpack": {
-      "version": "5.75.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz",
-      "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==",
+      "version": "5.78.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.78.0.tgz",
+      "integrity": "sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==",
       "dev": true,
       "dependencies": {
         "@types/eslint-scope": "^3.7.3",

@@ -18719,9 +18719,9 @@
       "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
     },
     "webpack": {
-      "version": "5.75.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz",
-      "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==",
+      "version": "5.78.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.78.0.tgz",
+      "integrity": "sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==",
      "dev": true,
       "requires": {
         "@types/eslint-scope": "^3.7.3",

sist2-admin/frontend/yarn.lock

@@ -1390,14 +1390,14 @@
     thread-loader "^3.0.0"
     webpack "^5.54.0"

-"@vue/cli-plugin-router@~5.0.8":
+"@vue/cli-plugin-router@^5.0.8", "@vue/cli-plugin-router@~5.0.8":
   version "5.0.8"
   resolved "https://registry.npmjs.org/@vue/cli-plugin-router/-/cli-plugin-router-5.0.8.tgz"
   integrity sha512-Gmv4dsGdAsWPqVijz3Ux2OS2HkMrWi1ENj2cYL75nUeL+Xj5HEstSqdtfZ0b1q9NCce+BFB6QnHfTBXc/fCvMg==
   dependencies:
     "@vue/cli-shared-utils" "^5.0.8"

-"@vue/cli-plugin-vuex@~5.0.8":
+"@vue/cli-plugin-vuex@^5.0.8", "@vue/cli-plugin-vuex@~5.0.8":
   version "5.0.8"
   resolved "https://registry.npmjs.org/@vue/cli-plugin-vuex/-/cli-plugin-vuex-5.0.8.tgz"
   integrity sha512-HSYWPqrunRE5ZZs8kVwiY6oWcn95qf/OQabwLfprhdpFWAGtLStShjsGED2aDpSSeGAskQETrtR/5h7VqgIlBA==

@@ -5492,9 +5492,9 @@ webpack-virtual-modules@^0.4.2:
   integrity sha512-5tyDlKLqPfMqjT3Q9TAqf2YqjwmnUleZwzJi1A5qXnlBCdj2AtOJ6wAWdglTIDOPgOiOrXeBeFcsQ8+aGQ6QbA==

 webpack@^5.54.0:
-  version "5.75.0"
-  resolved "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz"
-  integrity sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==
+  version "5.78.0"
+  resolved "https://registry.yarnpkg.com/webpack/-/webpack-5.78.0.tgz#836452a12416af2a7beae906b31644cb2562f9e6"
+  integrity sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==
   dependencies:
     "@types/eslint-scope" "^3.7.3"
     "@types/estree" "^0.0.51"
sist2-vue/package-lock.json (generated, 1615 changes)
File diff suppressed because it is too large.
sist2-vue/package.json

@@ -9,10 +9,11 @@
   "dependencies": {
     "@auth0/auth0-spa-js": "^2.0.2",
     "@egjs/vue-infinitegrid": "3.3.0",
+    "@tensorflow/tfjs": "^4.4.0",
     "axios": "^0.25.0",
     "bootstrap-vue": "^2.21.2",
     "core-js": "^3.6.5",
-    "d3": "^5.6.1",
+    "d3": "^7.8.4",
     "date-fns": "^2.21.3",
     "dom-to-image": "^2.6.0",
     "fslightbox-vue": "fslightbox-vue.tgz",
@@ -19,6 +19,7 @@
 import NavBar from "@/components/NavBar";
 import {mapActions, mapGetters, mapMutations} from "vuex";
 import Sist2Api from "@/Sist2Api";
+import ModelsRepo from "@/ml/modelsRepo";
 import {setupAuth0} from "@/main";

 export default {

@@ -36,6 +37,17 @@ export default {
   mounted() {
     this.$store.dispatch("loadConfiguration").then(() => {
       this.$root.$i18n.locale = this.$store.state.optLang;
+      ModelsRepo.init(this.$store.getters.mlRepositoryList).catch(err => {
+        this.$bvToast.toast(
+          this.$t("ml.repoFetchError"),
+          {
+            title: this.$t("ml.repoFetchErrorTitle"),
+            noAutoHide: true,
+            toaster: "b-toaster-bottom-right",
+            headerClass: "toast-header-warning",
+            bodyClass: "toast-body-warning",
+          });
+      });
     });

     this.$store.subscribe((mutation) => {
@@ -361,20 +361,20 @@ class Sist2Api {
         });
     }

-    getTreemapCsvUrl(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/1`;
+    getTreemapStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/TMAP`;
     }

-    getMimeCsvUrl(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/2`;
+    getMimeStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/MAGG`;
     }

-    getSizeCsv(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/3`;
+    getSizeStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/SAGG`;
     }

-    getDateCsv(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/4`;
+    getDateStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/DAGG`;
     }
 }
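The renamed methods suggest the stats endpoints now return JSON under mnemonic names (`TMAP`, `MAGG`, `SAGG`, `DAGG`) instead of numbered CSV routes. A hedged check against a running `sist2 web` instance; the base URL and index id are placeholders:

```bash
# Sketch: fetch the treemap stats for an index; <index-id> is hypothetical.
curl -s "http://localhost:4090/s/<index-id>/TMAP"
```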
sist2-vue/src/components/AnalyzedContentSpan.vue (new file, 21 lines)
@@ -0,0 +1,21 @@
<template>
    <span :style="getStyle()">{{span.text}}</span>
</template>

<script>

import ModelsRepo from "@/ml/modelsRepo";

export default {
    name: "AnalyzedContentSpan",
    props: ["span", "text"],
    methods: {
        getStyle() {
            return ModelsRepo.data[this.$store.getters.mlModel.name].labelStyles[this.span.label];
        }
    }
}
</script>

<style scoped></style>
sist2-vue/src/components/AnalyzedContentSpanContainer.vue (new file, 75 lines)
@@ -0,0 +1,75 @@
<template>
  <div>
    <b-card class="mb-2">
      <AnalyzedContentSpan v-for="span of legend" :key="span.id" :span="span"
                           class="mr-2"></AnalyzedContentSpan>
    </b-card>
    <div class="content-div">
      <AnalyzedContentSpan v-for="span of mergedSpans" :key="span.id" :span="span"></AnalyzedContentSpan>
    </div>
  </div>
</template>

<script>

import AnalyzedContentSpan from "@/components/AnalyzedContentSpan.vue";
import ModelsRepo from "@/ml/modelsRepo";

export default {
  name: "AnalyzedContentSpanContainer",
  components: {AnalyzedContentSpan},
  props: ["spans", "text"],
  computed: {
    legend() {
      return Object.entries(ModelsRepo.data[this.$store.state.mlModel.name].legend)
        .map(([label, name]) => ({
          text: name,
          id: label,
          label: label
        }));
    },
    mergedSpans() {
      const spans = this.spans;

      const merged = [];

      let lastLabel = null;
      let fixSpace = false;
      for (let i = 0; i < spans.length; i++) {

        if (spans[i].label !== lastLabel) {
          let start = spans[i].wordIndex;
          const nextSpan = spans.slice(i + 1).find(s => s.label !== spans[i].label)
          let end = nextSpan ? nextSpan.wordIndex : undefined;

          if (end !== undefined && this.text[end - 1] === " ") {
            end -= 1;
            fixSpace = true;
          }

          merged.push({
            text: this.text.slice(start, end),
            label: spans[i].label,
            id: spans[i].wordIndex
          });

          if (fixSpace) {
            merged.push({
              text: " ",
              label: "O",
              id: end
            });
            fixSpace = false;
          }
          lastLabel = spans[i].label;
        }
      }

      return merged;
    },
  },
}
</script>

<style scoped></style>
@@ -120,7 +120,7 @@ export default {
     update(indexId) {
       const svg = d3.select("#date-histogram");

-      d3.csv(Sist2Api.getDateCsv(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getDateStat(indexId)).then(tabularData => {
         dateHistogram(tabularData.slice(), svg, this.$t("d3.dateHistogram"));
       });
     }

@@ -91,7 +91,7 @@ export default {
       const mimeSvgCount = d3.select("#agg-mime-count");
       const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;

-      d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
         mimeBarCount(tabularData.slice(), mimeSvgCount, fillOpacity, this.$t("d3.mimeCount"));
       });
     }

@@ -90,7 +90,7 @@ export default {
       const mimeSvgSize = d3.select("#agg-mime-size");
       const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;

-      d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
         mimeBarSize(tabularData.slice(), mimeSvgSize, fillOpacity, this.$t("d3.mimeSize"));
       });
     }

@@ -117,7 +117,7 @@ export default {
     update(indexId) {
       const svg = d3.select("#size-histogram");

-      d3.csv(Sist2Api.getSizeCsv(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getSizeStat(indexId)).then(tabularData => {
         sizeHistogram(tabularData.slice(), svg, this.$t("d3.sizeHistogram"));
       });
     }

@@ -240,7 +240,7 @@ export default {
         .style("overflow", "visible")
         .style("font", "10px sans-serif");

-      d3.csv(Sist2Api.getTreemapCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getTreemapStat(indexId)).then(tabularData => {
         tabularData.forEach(row => {
           row.taxonomy = row.path.split("/");
           row.size = Number(row.size);
@@ -1,5 +1,5 @@
 <template>
-  <b-card class="mb-4 mt-4">
+  <b-card v-if="$store.state.sist2Info.showDebugInfo" class="mb-4 mt-4">
     <b-card-title><DebugIcon class="mr-1"></DebugIcon>{{ $t("debug") }}</b-card-title>
     <p v-html="$t('debugDescription')"></p>
@@ -16,6 +16,10 @@ export default {
   props: ["doc"],
   computed: {
     featuredLineHtml() {
+      if (this.$store.getters.optFeaturedFields === undefined) {
+        return "";
+      }
+
       const scope = {doc: this.doc._source, humanDate: humanDate, humanFileSize: humanFileSize};

       return this.$store.getters.optFeaturedFields
@@ -1,6 +1,36 @@
 <template>
   <Preloader v-if="loading"></Preloader>
-  <div v-else-if="content" class="content-div" v-html="content"></div>
+  <div v-else-if="content">
+    <b-form inline class="my-2" v-if="ModelsRepo.getOptions().length > 0">
+      <b-checkbox class="ml-auto mr-2" :checked="optAutoAnalyze"
+                  @input="setOptAutoAnalyze($event); $store.dispatch('updateConfiguration')">
+        {{ $t("ml.auto") }}
+      </b-checkbox>
+      <b-button :disabled="mlPredictionsLoading || mlLoading" @click="mlAnalyze" variant="primary"
+      >{{ $t("ml.analyzeText") }}
+      </b-button>
+      <b-select :disabled="mlPredictionsLoading || mlLoading" class="ml-2" v-model="mlModel">
+        <b-select-option :value="opt.value" v-for="opt of ModelsRepo.getOptions()">{{ opt.text }}
+        </b-select-option>
+      </b-select>
+    </b-form>
+
+    <b-progress v-if="mlLoading" variant="warning" show-progress :max="1" class="mb-3"
+    >
+      <b-progress-bar :value="modelLoadingProgress">
+        <strong>{{ ((modelLoadingProgress * modelSize) / (1024 * 1024)).toFixed(1) }}MB / {{
+            (modelSize / (1024 * 1024)).toFixed(1)
+          }}MB</strong>
+      </b-progress-bar>
+    </b-progress>
+
+    <b-progress v-if="mlPredictionsLoading" variant="primary" :value="modelPredictionProgress"
+                :max="content.length" class="mb-3"></b-progress>
+
+    <AnalyzedContentSpansContainer v-if="analyzedContentSpans.length > 0"
+                                   :spans="analyzedContentSpans" :text="rawContent"></AnalyzedContentSpansContainer>
+    <div v-else class="content-div" v-html="content"></div>
+  </div>
 </template>

 <script>

@@ -8,22 +38,40 @@ import Sist2Api from "@/Sist2Api";
 import Preloader from "@/components/Preloader";
 import Sist2Query from "@/Sist2Query";
 import store from "@/store";
+import BertNerModel from "@/ml/BertNerModel";
+import AnalyzedContentSpansContainer from "@/components/AnalyzedContentSpanContainer.vue";
+import ModelsRepo from "@/ml/modelsRepo";
+import {mapGetters, mapMutations} from "vuex";

 export default {
   name: "LazyContentDiv",
-  components: {Preloader},
+  components: {AnalyzedContentSpansContainer, Preloader},
   props: ["docId"],
   data() {
     return {
+      ModelsRepo,
       content: "",
-      loading: true
+      rawContent: "",
+      loading: true,
+      modelLoadingProgress: 0,
+      modelPredictionProgress: 0,
+      mlPredictionsLoading: false,
+      mlLoading: false,
+      mlModel: null,
+      analyzedContentSpans: []
     }
   },
   mounted() {
+    if (this.$store.getters.optMlDefaultModel) {
+      this.mlModel = this.$store.getters.optMlDefaultModel
+    } else {
+      this.mlModel = ModelsRepo.getDefaultModel();
+    }
+
     const query = Sist2Query.searchQuery();

     if (this.$store.state.optHighlight) {

       const fields = this.$store.state.fuzzy
         ? {"content.nGram": {}}
         : {content: {}};

@@ -67,14 +115,28 @@ export default {
       this.loading = false;
       if (resp.hits.hits.length === 1) {
         this.content = this.getContent(resp.hits.hits[0]);
       } else {
         console.log("FIXME: could not get content")
         console.log(resp)
       }
+
+      if (this.optAutoAnalyze) {
+        this.mlAnalyze();
+      }
     });
   },
+  computed: {
+    ...mapGetters(["optAutoAnalyze"]),
+    modelSize() {
+      const modelData = ModelsRepo.data[this.mlModel];
+      if (!modelData) {
+        return 0;
+      }
+      return modelData.size;
+    }
+  },
   methods: {
+    ...mapMutations(["setOptAutoAnalyze"]),
     getContent(doc) {
+      this.rawContent = doc._source.content;
+
       if (!doc.highlight) {
         return doc._source.content;
       }

@@ -85,10 +147,60 @@ export default {
       if (doc.highlight.content) {
         return doc.highlight.content[0];
       }
     },
+    async getMlModel() {
+      if (this.$store.getters.mlModel.name !== this.mlModel) {
+        this.mlLoading = true;
+        this.modelLoadingProgress = 0;
+        const modelInfo = ModelsRepo.data[this.mlModel];
+
+        const model = new BertNerModel(
+          modelInfo.vocabUrl,
+          modelInfo.modelUrl,
+          modelInfo.id2label,
+        )
+
+        await model.init(progress => this.modelLoadingProgress = progress);
+        this.$store.commit("setMlModel", {model, name: this.mlModel});
+
+        this.mlLoading = false;
+        return model
+      }
+
+      return this.$store.getters.mlModel.model;
+    },
+    async mlAnalyze() {
+      if (!this.content) {
+        return;
+      }
+
+      const modelInfo = ModelsRepo.data[this.mlModel];
+      if (modelInfo === undefined) {
+        return;
+      }
+
+      this.$store.commit("setOptMlDefaultModel", this.mlModel);
+      await this.$store.dispatch("updateConfiguration");
+
+      const model = await this.getMlModel();
+
+      this.analyzedContentSpans = [];
+
+      this.mlPredictionsLoading = true;
+
+      await model.predict(this.rawContent, results => {
+        results.forEach(result => result.label = modelInfo.humanLabels[result.label]);
+        this.analyzedContentSpans.push(...results);
+        this.modelPredictionProgress = results[results.length - 1].wordIndex;
+      });
+      this.mlPredictionsLoading = false;
+    }
   }
 }
 </script>

-<style scoped>
+<style>
 .progress-bar {
   transition: none;
 }
 </style>
@@ -49,6 +49,7 @@ export default {
         configReset: "Reset configuration",
         searchOptions: "Search options",
         treemapOptions: "Treemap options",
+        mlOptions: "Machine learning options",
         displayOptions: "Display options",
         opt: {
             lang: "Language",

@@ -78,7 +79,10 @@ export default {
             simpleLightbox: "Disable animations in image viewer",
             showTagPickerFilter: "Display the tag filter bar",
             featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
-            featuredFieldsList: "Available variables"
+            featuredFieldsList: "Available variables",
+            autoAnalyze: "Automatically analyze text",
+            defaultModel: "Default model",
+            mlRepositories: "Model repositories (one per line)"
         },
         queryMode: {
             simple: "Simple",

@@ -171,6 +175,12 @@ export default {
             selectedIndex: "selected index",
            selectedIndices: "selected indices",
         },
+        ml: {
+            analyzeText: "Analyze",
+            auto: "Auto",
+            repoFetchError: "Failed to get list of models. Check browser console for more details.",
+            repoFetchErrorTitle: "Could not fetch model repositories",
+        }
     },
     de: {
         filePage: {

@@ -250,8 +260,8 @@ export default {
             vidPreviewInterval: "Videovorschau Framedauer in ms",
             simpleLightbox: "Schalte Animationen im Image-Viewer ab",
             showTagPickerFilter: "Zeige die Tag-Filter-Leiste",
-            featuredFields: "Ausgewählte Felder Javascript Vorlage String. Wird in den Suchergebnissen angezeigt.",
-            featuredFieldsList: "Verfügbare Variablen"
+            featuredFields: "Variablen, welche zusätzlich in den Suchergebnissen angezeigt werden können.",
+            featuredFieldsList: "verfügbare Variablen"
         },
         queryMode: {
             simple: "Einfach",

@@ -333,10 +343,10 @@ export default {
             random: "zufällig",
         },
         d3: {
-            mimeCount: "Anzahlverteilung nach Medientyp",
-            mimeSize: "Größenverteilung nach Medientyp",
-            dateHistogram: "Verteilung der Änderungszeiten",
-            sizeHistogram: "Verteilung der Dateigrößen",
+            mimeCount: "Anzahl nach Medientyp",
+            mimeSize: "Größen nach Medientyp",
+            dateHistogram: "Änderungszeiten",
+            sizeHistogram: "Dateigrößen",
         },
         indexPicker: {
             selectNone: "keinen auswählen",
sist2-vue/src/ml/BertNerModel.js (new file, 77 lines)
@@ -0,0 +1,77 @@
import BertTokenizer from "@/ml/BertTokenizer";
import * as tf from "@tensorflow/tfjs";
import axios from "axios";

export default class BertNerModel {
    vocabUrl;
    modelUrl;

    id2label;
    _tokenizer;
    _model;
    inputSize = 128;

    _previousWordId = null;

    constructor(vocabUrl, modelUrl, id2label) {
        this.vocabUrl = vocabUrl;
        this.modelUrl = modelUrl;
        this.id2label = id2label;
    }

    async init(onProgress) {
        await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
    }

    async loadTokenizer() {
        const vocab = (await axios.get(this.vocabUrl)).data;
        this._tokenizer = new BertTokenizer(vocab);
    }

    async loadModel(onProgress) {
        this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
    }

    alignLabels(labels, wordIds, words) {
        const result = [];

        for (let i = 0; i < this.inputSize; i++) {
            const label = labels[i];
            const wordId = wordIds[i];

            if (wordId === -1) {
                continue;
            }
            if (wordId === this._previousWordId) {
                continue;
            }

            result.push({
                word: words[wordId].text, wordIndex: words[wordId].index, label: label
            });
            this._previousWordId = wordId;
        }

        return result;
    }

    async predict(text, callback) {
        this._previousWordId = null;
        const encoded = this._tokenizer.encodeText(text, this.inputSize)

        for (let chunk of encoded.inputChunks) {
            const rawResult = tf.tidy(() => this._model.execute({
                input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
                token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
                attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
            }));

            const labelIds = await tf.argMax(rawResult, -1);
            const labelIdsArray = await labelIds.array();
            const labels = labelIdsArray[0].map(id => this.id2label[id]);
            rawResult.dispose()

            callback(this.alignLabels(labels, chunk.wordIds, encoded.words))
        }
    }
}
sist2-vue/src/ml/BertTokenizer.js (new file, 184 lines; diff truncated below)
@@ -0,0 +1,184 @@
|
||||
import {zip, chunk} from "underscore";

const UNK_INDEX = 100;
const CLS_INDEX = 101;
const SEP_INDEX = 102;
const CONTINUING_SUBWORD_PREFIX = "##";

function isWhitespace(ch) {
    return /\s/.test(ch);
}

function isInvalid(ch) {
    return (ch.charCodeAt(0) === 0 || ch.charCodeAt(0) === 0xfffd);
}

const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';

/** Returns true if ch is a punctuation character. */
function isPunctuation(ch) {
    return punctuations.indexOf(ch) !== -1;
}

export default class BertTokenizer {
    vocab;

    constructor(vocab) {
        this.vocab = vocab;
    }

    tokenize(text) {
        const charOriginalIndex = [];
        const cleanedText = this.cleanText(text, charOriginalIndex);
        const origTokens = cleanedText.split(' ');

        let charCount = 0;
        const tokens = origTokens.map((token) => {
            token = token.toLowerCase();
            const tokens = this.runSplitOnPunctuation(token, charCount, charOriginalIndex);
            charCount += token.length + 1;
            return tokens;
        });

        let flattenTokens = [];
        for (let index = 0; index < tokens.length; index++) {
            flattenTokens = flattenTokens.concat(tokens[index]);
        }
        return flattenTokens;
    }

    /* Performs invalid character removal and whitespace cleanup on text. */
    cleanText(text, charOriginalIndex) {
        text = text.replace(/\?/g, "").trim();

        const stringBuilder = [];
        let originalCharIndex = 0;
        let newCharIndex = 0;

        for (const ch of text) {
            // Skip the characters that cannot be used.
            if (isInvalid(ch)) {
                originalCharIndex += ch.length;
                continue;
            }
            if (isWhitespace(ch)) {
                if (stringBuilder.length > 0 && stringBuilder[stringBuilder.length - 1] !== ' ') {
                    stringBuilder.push(' ');
                    charOriginalIndex[newCharIndex] = originalCharIndex;
                    originalCharIndex += ch.length;
                } else {
                    originalCharIndex += ch.length;
                    continue;
                }
            } else {
                stringBuilder.push(ch);
                charOriginalIndex[newCharIndex] = originalCharIndex;
                originalCharIndex += ch.length;
            }
            newCharIndex++;
        }
        return stringBuilder.join('');
    }

    /* Splits punctuation on a piece of text. */
    runSplitOnPunctuation(text, count, charOriginalIndex) {
        const tokens = [];
        let startNewWord = true;
        for (const ch of text) {
            if (isPunctuation(ch)) {
                tokens.push({text: ch, index: charOriginalIndex[count]});
                count += ch.length;
                startNewWord = true;
            } else {
                if (startNewWord) {
                    tokens.push({text: '', index: charOriginalIndex[count]});
                    startNewWord = false;
                }
                tokens[tokens.length - 1].text += ch;
                count += ch.length;
            }
        }
        return tokens;
    }

    encode(words) {
        let outputTokens = [];
        const wordIds = [];

        for (let i = 0; i < words.length; i++) {
            let chars = [...words[i].text];

            let isUnknown = false;
            let start = 0;
            let subTokens = [];

            // Greedy longest-match-first WordPiece lookup.
            while (start < chars.length) {
                let end = chars.length;
                let currentSubstring = null;
                while (start < end) {
                    let substr = chars.slice(start, end).join('');

                    if (start > 0) {
                        substr = CONTINUING_SUBWORD_PREFIX + substr;
                    }
                    if (this.vocab.includes(substr)) {
                        // currentSubstring holds the vocabulary *index* of the match.
                        currentSubstring = this.vocab.indexOf(substr);
                        break;
                    }

                    --end;
                }
                if (currentSubstring == null) {
                    isUnknown = true;
                    break;
                }
                subTokens.push(currentSubstring);
                start = end;
            }

            if (isUnknown) {
                outputTokens.push(UNK_INDEX);
                wordIds.push(i);
            } else {
                subTokens.forEach(tok => {
                    outputTokens.push(tok);
                    wordIds.push(i);
                });
            }
        }

        return {tokens: outputTokens, wordIds};
    }

    encodeText(inputText, inputSize) {

        const tokenized = this.tokenize(inputText);
        const encoded = this.encode(tokenized);

        const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
        const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);

        const chunks = [];

        zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, wordIds]) => {
            const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
            const segmentIds = Array(inputIds.length).fill(0);
            const inputMask = Array(inputIds.length).fill(1);
            wordIds = [-1, ...wordIds, -1];

            while (inputIds.length < inputSize) {
                inputIds.push(0);
                inputMask.push(0);
                segmentIds.push(0);
                wordIds.push(-1);
            }

            chunks.push({inputIds, inputMask, segmentIds, wordIds});
        });

        return {
            inputChunks: chunks,
            words: tokenized
        };
    }
}
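A usage sketch for the tokenizer (illustrative only, not part of this diff; the
four-entry vocabulary is a toy -- real vocab files have tens of thousands of
entries whose indices line up with the UNK/CLS/SEP constants above):

const tokenizer = new BertTokenizer(["[PAD]", "hello", "world", "##s"]);
const encoded = tokenizer.encodeText("Hello worlds!", 128);
// encoded.words -> [{text: "hello", index: 0}, {text: "worlds", index: 6}, {text: "!", index: 12}]
// encoded.inputChunks[0].inputIds -> [CLS_INDEX, 1, 2, 3, UNK_INDEX, SEP_INDEX, 0, 0, ...] (padded to 128)
// encoded.inputChunks[0].wordIds  -> [-1, 0, 1, 1, 2, -1, -1, ...]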
43 sist2-vue/src/ml/modelsRepo.js Normal file
@@ -0,0 +1,43 @@
import axios from "axios";

class ModelsRepo {
    _repositories;
    data = {};

    async init(repositories) {
        this._repositories = repositories;

        const data = await Promise.all(this._repositories.map(this._loadRepository));

        data.forEach(models => {
            models.forEach(model => {
                this.data[model.name] = model;
            });
        });
    }

    async _loadRepository(repository) {
        const data = (await axios.get(repository)).data;
        data.forEach(model => {
            // Resolve model/vocab paths relative to the repository URL
            model["modelUrl"] = new URL(model["modelPath"], repository).href;
            model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
        });
        return data;
    }

    getOptions() {
        return Object.values(this.data).map(model => ({
            text: `${model.name} (${Math.round(model.size / (1024 * 1024))}MB)`,
            value: model.name
        }));
    }

    getDefaultModel() {
        if (Object.values(this.data).length === 0) {
            return null;
        }
        // Guard against repositories where no model is flagged as default
        const defaultModel = Object.values(this.data).find(model => model.default);
        return defaultModel ? defaultModel.name : null;
    }
}

export default new ModelsRepo();
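Usage sketch (illustrative only, not part of this diff; the repository URL is
the default value of optMlRepositories in the store change below):

import modelsRepo from "@/ml/modelsRepo";

await modelsRepo.init(["https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json"]);
modelsRepo.getOptions();      // e.g. [{text: "<name> (25MB)", value: "<name>"}, ...]
modelsRepo.getDefaultModel(); // name of the entry flagged "default", or null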
@@ -5,7 +5,7 @@ import {EsHit, EsResult, EsTag, Index, Tag} from "@/Sist2Api";
 import {deserializeMimes, randomSeed, serializeMimes} from "@/util";
 import {getInstance} from "@/plugins/auth0.js";

-const CONF_VERSION = 2;
+const CONF_VERSION = 3;

 Vue.use(Vuex)
@@ -57,6 +57,9 @@ export default new Vuex.Store({
         optVidPreviewInterval: 700,
         optSimpleLightbox: true,
         optShowTagPickerFilter: true,
+        optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
+        optAutoAnalyze: false,
+        optMlDefaultModel: null,

         _onLoadSelectedIndices: [] as string[],
         _onLoadSelectedMimeTypes: [] as string[],
@@ -86,7 +89,11 @@ export default new Vuex.Store({

         uiMimeMap: [] as any[],

-        auth0Token: null
+        auth0Token: null,
+        mlModel: {
+            model: null,
+            name: null
+        },
     },
     mutations: {
         setUiShowDetails: (state, val) => state.uiShowDetails = val,
@@ -172,6 +179,9 @@ export default new Vuex.Store({
         setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
         setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
         setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
+        setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
+        setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
+        setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},

         setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
         setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
@@ -194,6 +204,7 @@ export default new Vuex.Store({
             // noop
         },
         setAuth0Token: (state, val) => state.auth0Token = val,
+        setMlModel: (state, val) => state.mlModel = val,
     },
     actions: {
         setSist2Info: (store, val) => {
@@ -350,6 +361,7 @@ export default new Vuex.Store({
     },
     modules: {},
     getters: {
+        mlModel: (state) => state.mlModel,
         seed: (state) => state.seed,
         getPathText: (state) => state.pathText,
         indices: state => state.indices,
@@ -416,5 +428,12 @@ export default new Vuex.Store({
         optSimpleLightbox: state => state.optSimpleLightbox,
         optShowTagPickerFilter: state => state.optShowTagPickerFilter,
         optFeaturedFields: state => state.optFeaturedFields,
+        optMlRepositories: state => state.optMlRepositories,
+        mlRepositoryList: state => {
+            const repos = state.optMlRepositories.split("\n")
+            return repos[0] == "" ? [] : repos;
+        },
+        optMlDefaultModel: state => state.optMlDefaultModel,
+        optAutoAnalyze: state => state.optAutoAnalyze,
     }
 })
@@ -25,7 +25,8 @@
             <b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>

             <label>{{ $t("opt.displayMode") }}</label>
-            <b-form-select :options="displayModeOptions" :value="optDisplay" @input="setOptDisplay"></b-form-select>
+            <b-form-select :options="displayModeOptions" :value="optDisplay"
+                           @input="setOptDisplay"></b-form-select>

             <label>{{ $t("opt.columns") }}</label>
             <b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
@@ -123,7 +124,10 @@
                 }}
             </b-form-checkbox>

-            <b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
+            <b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{
+                    $t("opt.highlight")
+                }}
+            </b-form-checkbox>
             <b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
                     $t("opt.tagOrOperator")
                 }}
@@ -148,7 +152,8 @@
                           @input="setOptResultSize"></b-form-input>

             <label>{{ $t("opt.queryMode") }}</label>
-            <b-form-select :options="queryModeOptions" :value="optQueryMode" @input="setOptQueryMode"></b-form-select>
+            <b-form-select :options="queryModeOptions" :value="optQueryMode"
+                           @input="setOptQueryMode"></b-form-select>

             <label>{{ $t("opt.slideDuration") }}</label>
             <b-form-input :value="optLightboxSlideDuration" type="number" min="1"
@@ -159,6 +164,17 @@
                           @input="setOptVidPreviewInterval"></b-form-input>
         </b-card>

+        <h4 class="mt-3">{{ $t("mlOptions") }}</h4>
+        <b-card>
+            <label>{{ $t("opt.mlRepositories") }}</label>
+            <b-textarea rows="3" :value="optMlRepositories" @input="setOptMlRepositories"></b-textarea>
+            <br>
+            <b-form-checkbox :checked="optAutoAnalyze" @input="setOptAutoAnalyze">{{
+                    $t("opt.autoAnalyze")
+                }}
+            </b-form-checkbox>
+        </b-card>
+
         <h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
         <b-card>
             <label>{{ $t("opt.treemapType") }}</label>
@@ -311,6 +327,8 @@ export default {
             "optSimpleLightbox",
             "optShowTagPickerFilter",
             "optFeaturedFields",
+            "optMlRepositories",
+            "optAutoAnalyze",
         ]),
         clientWidth() {
             return window.innerWidth;
@@ -355,6 +373,8 @@ export default {
             "setOptSimpleLightbox",
             "setOptShowTagPickerFilter",
             "setOptFeaturedFields",
+            "setOptMlRepositories",
+            "setOptAutoAnalyze",
         ]),
         onResetClick() {
             localStorage.removeItem("sist2_configuration");
@@ -7,7 +7,11 @@
             <Preloader></Preloader>
         </b-card>

-        <b-card v-show="!uiLoading" id="search-panel">
+        <b-alert v-show="!uiLoading && showEsConnectionError" show variant="danger" class="mt-2">
+            {{ $t("toast.esConnErr") }}
+        </b-alert>
+
+        <b-card v-show="!uiLoading && !showEsConnectionError" id="search-panel">
             <SearchBar @show-help="showHelp=true"></SearchBar>
             <b-row>
                 <b-col style="height: 70px;" sm="6">
@@ -94,7 +98,8 @@ export default Vue.extend({
         docChecksums: new Set(),
         searchBusy: false,
         Sist2Query: Sist2Query,
-        showHelp: false
+        showHelp: false,
+        showEsConnectionError: false
     }),
     computed: {
         ...mapGetters(["indices", "optDisplay"]),
@@ -143,6 +148,15 @@ export default Vue.extend({
                 this.uiLoading = false;
                 this.search(true);
             });
+        }).catch(error => {
+            console.log(error);
+
+            if (error.response.status == 503 || error.response.status == 500) {
+                this.showEsConnectionError = true;
+                this.uiLoading = false;
+            } else {
+                this.showErrorToast();
+            }
         });
     },
     methods: {
@@ -253,11 +267,20 @@ export default Vue.extend({
                 },
                 size: 0
             }).then(res => {
-                return {
+                const range = {
                     min: res.aggregations.dateMin.value,
                     max: res.aggregations.dateMax.value,
                 }
-            })
+
+                if (range.min == null) {
+                    range.min = 0;
+                    range.max = 1;
+                } else if (range.min == range.max) {
+                    range.max += 1;
+                }
+
+                return range;
+            });
         },
         appendFunc() {
             if (!this.$store.state.uiReachedScrollEnd && this.search && !this.searchBusy) {
@@ -83,6 +83,7 @@ void database_open(database_t *db) {
     LOG_DEBUGF("database.c", "Opening database %s (%d)", db->filename, db->type);

     CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
+    sqlite3_busy_timeout(db->db, 1000);

     CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA cache_size = -200000;", NULL, NULL, NULL));
     CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA synchronous = OFF;", NULL, NULL, NULL));
@@ -328,18 +329,18 @@ database_iterator_t *database_create_document_iterator(database_t *db) {
             " WHEN sc.json_data IS NULL THEN"
             " CASE"
             " WHEN t.tag IS NULL THEN"
-            " document.json_data"
+            " json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime)"
             " ELSE"
-            " json_set(document.json_data, '$.tag', json_group_array(t.tag))"
+            " json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
             " END"
             " ELSE"
             " CASE"
             " WHEN t.tag IS NULL THEN"
-            " json_patch(document.json_data, sc.json_data)"
+            " json_patch(json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime), sc.json_data)"
             " ELSE"
             // This will overwrite any tags specified in the sidecar file!
             // TODO: concatenate the two arrays?
-            " json_set(json_patch(document.json_data, sc.json_data), '$.tag', json_group_array(t.tag))"
+            " json_set(json_patch(document.json_data, sc.json_data), '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
             " END"
             " END"
             " FROM document"
@@ -581,18 +582,33 @@ void database_add_work(database_t *db, job_t *job) {
             ret = sqlite3_step(db->insert_parse_job_stmt);

             if (ret == SQLITE_FULL) {
                 sqlite3_reset(db->insert_parse_job_stmt);
                 pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                 usleep(1000000);
                 pthread_mutex_lock(&db->ipc_ctx->db_mutex);
                 continue;
             } else {
                 CRASH_IF_STMT_FAIL(ret);
             }

-            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->insert_parse_job_stmt));
-        } while (ret != SQLITE_DONE);
+            ret = sqlite3_reset(db->insert_parse_job_stmt);
+            if (ret == SQLITE_FULL) {
+                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
+                usleep(100000);
+                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
+            } else if (ret != SQLITE_OK) {
+                LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
+            }
+        } while (ret != SQLITE_DONE && ret != SQLITE_OK);
     } else if (job->type == JOB_BULK_LINE) {
         do {
             sqlite3_bind_text(db->insert_index_job_stmt, 1, job->bulk_line->doc_id, -1, SQLITE_STATIC);
             sqlite3_bind_int(db->insert_index_job_stmt, 2, job->bulk_line->type);
+            if (job->bulk_line->type != ES_BULK_LINE_DELETE) {
                 sqlite3_bind_text(db->insert_index_job_stmt, 3, job->bulk_line->line, -1, SQLITE_STATIC);
+            } else {
+                sqlite3_bind_null(db->insert_index_job_stmt, 3);
+            }

             ret = sqlite3_step(db->insert_index_job_stmt);
@@ -611,6 +627,8 @@ void database_add_work(database_t *db, job_t *job) {
                 pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                 usleep(100000);
                 pthread_mutex_lock(&db->ipc_ctx->db_mutex);
+            } else if (ret != SQLITE_OK) {
+                LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
             }

         } while (ret != SQLITE_DONE && ret != SQLITE_OK);
@@ -18,6 +18,14 @@ typedef enum {
     FTS_DATABASE
 } database_type_t;

+typedef enum {
+    DATABASE_STAT_INVALID,
+    DATABASE_STAT_TREEMAP,
+    DATABASE_STAT_MIME_AGG,
+    DATABASE_STAT_SIZE_AGG,
+    DATABASE_STAT_DATE_AGG,
+} database_stat_type_d;
+
 typedef enum {
     JOB_UNDEFINED,
     JOB_BULK_LINE,
@@ -104,14 +112,14 @@ database_iterator_t *database_create_document_iterator(database_t *db);
 cJSON *database_document_iter(database_iterator_t *);

 #define database_document_iter_foreach(element, iter) \
-    for (cJSON *element = database_document_iter(iter); element != NULL; element = database_document_iter(iter))
+    for (cJSON *(element) = database_document_iter(iter); (element) != NULL; (element) = database_document_iter(iter))

 database_iterator_t *database_create_delete_list_iterator(database_t *db);

 char *database_delete_list_iter(database_iterator_t *iter);

 #define database_delete_list_iter_foreach(element, iter) \
-    for (char *element = database_delete_list_iter(iter); element != NULL; element = database_delete_list_iter(iter))
+    for (char *(element) = database_delete_list_iter(iter); (element) != NULL; (element) = database_delete_list_iter(iter))


 cJSON *database_incremental_scan_begin(database_t *db);
@@ -132,12 +140,16 @@ treemap_row_t database_treemap_iter(database_iterator_t *iter);

 void database_generate_stats(database_t *db, double treemap_threshold);

+database_stat_type_d database_get_stat_type_by_mnemonic(const char *name);
+
 job_t *database_get_work(database_t *db, job_type_t job_type);

 void database_add_work(database_t *db, job_t *job);

 //void database_index(database_t *db);

+cJSON *database_get_stats(database_t *db, database_stat_type_d type);
+
 #define CRASH_IF_STMT_FAIL(x) do { \
     int return_value = x; \
     if (return_value != SQLITE_DONE && return_value != SQLITE_ROW) { \
@@ -6,6 +6,7 @@
 #define SIZE_BUCKET (long)(5 * 1000 * 1000)
 #define DATE_BUCKET (long)(2629800) // ~30 days

+
 database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold) {

     sqlite3_stmt *stmt;
@@ -157,3 +158,85 @@ void database_generate_stats(database_t *db, double treemap_threshold) {
     LOG_INFO("database.c", "Done!");
 }

+database_stat_type_d database_get_stat_type_by_mnemonic(const char *name) {
+    if (strcmp(name, "TMAP") == 0) {
+        return DATABASE_STAT_TREEMAP;
+    }
+    if (strcmp(name, "MAGG") == 0) {
+        return DATABASE_STAT_MIME_AGG;
+    }
+    if (strcmp(name, "SAGG") == 0) {
+        return DATABASE_STAT_SIZE_AGG;
+    }
+    if (strcmp(name, "DAGG") == 0) {
+        return DATABASE_STAT_DATE_AGG;
+    }
+
+    return DATABASE_STAT_INVALID;
+}
+
+cJSON *database_get_stats(database_t *db, database_stat_type_d type) {
+
+    sqlite3_stmt *stmt;
+
+    switch (type) {
+        case DATABASE_STAT_TREEMAP:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT path,size FROM stats_treemap", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_DATE_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT bucket,count FROM stats_date_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_SIZE_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT bucket,count FROM stats_size_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_MIME_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT mime,size,count FROM stats_mime_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_INVALID:
+        default:
+            LOG_FATALF("database_stats.c", "Invalid stat type: %d", type);
+    }
+
+    cJSON *json = cJSON_CreateArray();
+
+    int ret;
+    do {
+        ret = sqlite3_step(stmt);
+        CRASH_IF_STMT_FAIL(ret);
+
+        if (ret == SQLITE_DONE) {
+            break;
+        }
+
+        cJSON *row = cJSON_CreateObject();
+
+        switch (type) {
+            case DATABASE_STAT_TREEMAP:
+                cJSON_AddStringToObject(row, "path", (const char *) sqlite3_column_text(stmt, 0));
+                cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
+                break;
+            case DATABASE_STAT_DATE_AGG:
+            case DATABASE_STAT_SIZE_AGG:
+                cJSON_AddNumberToObject(row, "bucket", (double) sqlite3_column_int64(stmt, 0));
+                cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 1));
+                break;
+            case DATABASE_STAT_MIME_AGG:
+                cJSON_AddStringToObject(row, "mime", (const char *) sqlite3_column_text(stmt, 0));
+                cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
+                cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 2));
+                break;
+        }
+
+        cJSON_AddItemToArray(json, row);
+    } while (TRUE);
+
+    return json;
+}
@@ -1,5 +0,0 @@
-#ifndef SIST2_DATABASE_STATS_H
-#define SIST2_DATABASE_STATS_H
-
-
-#endif //SIST2_DATABASE_STATS_H
@@ -64,20 +64,16 @@ void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {
     cJSON_Delete(line);
 }

 void index_json_func(job_t *job) {
     elastic_index_line(job->bulk_line);
 }

 void delete_document(const char *document_id) {
-    es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t));
+    es_bulk_line_t bulk_line;

-    bulk_line->type = ES_BULK_LINE_DELETE;
-    bulk_line->next = NULL;
-    strcpy(bulk_line->doc_id, document_id);
+    bulk_line.type = ES_BULK_LINE_DELETE;
+    bulk_line.next = NULL;
+    strcpy(bulk_line.doc_id, document_id);

     tpool_add_work(IndexCtx.pool, &(job_t) {
             .type = JOB_BULK_LINE,
-            .bulk_line = bulk_line,
+            .bulk_line = &bulk_line,
     });
 }
@@ -99,6 +95,7 @@ void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
             .type = JOB_BULK_LINE,
             .bulk_line = bulk_line,
     });
+    free(bulk_line);
 }

 void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {
@@ -91,8 +91,6 @@ char *build_json_string(document_t *doc) {
     } else {
         cJSON_AddStringToObject(json, "mime", mime_text);
     }
-    cJSON_AddNumberToObject(json, "size", (double) doc->size);
-    cJSON_AddNumberToObject(json, "mtime", doc->mtime);

     // Ignore root directory in the file path
     doc->ext = (short) (doc->ext - ScanCtx.index.desc.root_len);
@@ -122,8 +120,6 @@ char *build_json_string(document_t *doc) {
         cJSON_AddStringToObject(json, "path", "");
     }

-    cJSON_AddStringToObject(json, "_id", doc->doc_id);
-
     // Metadata
     meta_line_t *meta = doc->meta_head;
     while (meta != NULL) {
14 src/main.c
@@ -195,6 +195,10 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.mobi_ctx.content_size = args->content_size;
     ScanCtx.mobi_ctx.log = log_callback;
     ScanCtx.mobi_ctx.logf = logf_callback;
+    ScanCtx.mobi_ctx.store = write_thumbnail_callback;
+    ScanCtx.mobi_ctx.enable_tn = args->tn_count > 0;
+    ScanCtx.mobi_ctx.tn_size = args->tn_size;
+    ScanCtx.mobi_ctx.tn_qscale = args->tn_quality;

     // TEXT
     ScanCtx.text_ctx.content_size = args->content_size;
@@ -312,17 +316,20 @@ void sist2_index(index_args_t *args) {
     database_open(db);
     database_iterator_t *iterator = database_create_document_iterator(db);
     database_document_iter_foreach(json, iterator) {
-        const char *doc_id = cJSON_GetObjectItem(json, "_id")->valuestring;
+        char doc_id[SIST_DOC_ID_LEN];
+        strcpy(doc_id, cJSON_GetObjectItem(json, "_id")->valuestring);
+        cJSON_DeleteItemFromObject(json, "_id");

         if (args->print) {
             print_json(json, doc_id);
         } else {
             index_json(json, doc_id);
             cnt += 1;
         }
         cJSON_Delete(json);
     }

     free(iterator);
-    database_close(db, FALSE);

     if (!args->print) {
         database_iterator_t *del_iter = database_create_delete_list_iterator(db);
@@ -330,8 +337,11 @@ void sist2_index(index_args_t *args) {
             delete_document(id);
             free(id);
         }
+        free(del_iter);
     }

+    database_close(db, FALSE);
+
     tpool_wait(IndexCtx.pool);
     tpool_destroy(IndexCtx.pool);
@@ -51,11 +51,11 @@
 #include <ctype.h>
 #include "git_hash.h"

-#define VERSION "3.0.0"
+#define VERSION "3.0.5"
 static const char *const Version = VERSION;
 static const int VersionMajor = 3;
 static const int VersionMinor = 0;
-static const int VersionPatch = 0;
+static const int VersionPatch = 5;

 #ifndef SIST_PLATFORM
 #define SIST_PLATFORM unknown
@@ -149,6 +149,11 @@ void worker_proc_cleanup(tpool_t *pool) {
     if (ProcData.index_db != NULL) {
         database_close(ProcData.index_db, FALSE);
     }

+    if (IndexCtx.needs_es_connection) {
+        elastic_cleanup();
+    }
+
     database_close(ProcData.ipc_db, FALSE);
 }
@@ -242,6 +247,7 @@ static void *tpool_worker(void *arg) {
     pthread_mutex_lock(&pool->shm->mutex);
     pthread_cond_signal(&pool->shm->done_working_cond);
     pthread_mutex_unlock(&pool->shm->mutex);
+    worker_proc_cleanup(pool);
 #endif

     return NULL;
@@ -20,49 +20,40 @@ static struct mg_http_serve_opts DefaultServeOpts = {

 void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 7) {
         HTTP_REPLY_NOT_FOUND
         return;
     }

     char arg_index_id[SIST_INDEX_ID_LEN];
+    char arg_stat_type[5];

     memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
     *(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';
+    memcpy(arg_stat_type, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, 4);
+    *(arg_stat_type + sizeof(arg_stat_type) - 1) = '\0';

-    index_t *index = web_get_index_by_id(arg_index_id);
-    if (index == NULL) {
+    database_stat_type_d stat_type = database_get_stat_type_by_mnemonic(arg_stat_type);
+    if (stat_type == DATABASE_STAT_INVALID) {
         HTTP_REPLY_NOT_FOUND
         return;
     }

-    const char *file;
-    switch (atoi(hm->uri.ptr + 3 + SIST_INDEX_ID_LEN)) {
-        case 1:
-            file = "treemap.csv";
-            break;
-        case 2:
-            file = "mime_agg.csv";
-            break;
-        case 3:
-            file = "size_agg.csv";
-            break;
-        case 4:
-            file = "date_agg.csv";
-            break;
-        default:
+    database_t *db = web_get_database(arg_index_id);
+    if (db == NULL) {
+        LOG_DEBUGF("serve.c", "Could not get database for index: %s", arg_index_id);
         HTTP_REPLY_NOT_FOUND
         return;
     }

-    char disposition[8192];
-    snprintf(disposition, sizeof(disposition),
-             "Content-Disposition: inline; filename=\"%s\"\r\nCache-Control: max-age=31536000\r\n", file);
+    cJSON *json = database_get_stats(db, stat_type);
+    char *json_str = cJSON_PrintUnformatted(json);

-    char full_path[PATH_MAX];
-    strcpy(full_path, index->path);
-    strcat(full_path, file);
+    web_send_headers(nc, 200, strlen(json_str), "Content-Type: application/json");
+    mg_send(nc, json_str, strlen(json_str));

-    struct mg_http_serve_opts opts = {};
-    mg_http_serve_file(nc, hm, full_path, &opts);
+    free(json_str);
+    cJSON_Delete(json);
 }

 void serve_index_html(struct mg_connection *nc, struct mg_http_message *hm) {
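Client-side view of this change (illustrative only, not part of this diff):
stats are now fetched as JSON from /s/<index-id><mnemonic>, where the mnemonic
is one of TMAP, MAGG, SAGG or DAGG, instead of being served from pre-generated
CSV files. indexId below is a hypothetical index id.

const res = await fetch(`/s/${indexId}TMAP`); // TMAP, MAGG, SAGG or DAGG
const rows = await res.json();                // e.g. [{path: "...", size: 1234}, ...]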
@@ -286,16 +277,23 @@ void index_info(struct mg_connection *nc) {
     cJSON *json = cJSON_CreateObject();
     cJSON *arr = cJSON_AddArrayToObject(json, "indices");

-    cJSON_AddStringToObject(json, "mongooseVersion", MG_VERSION);
     cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
     cJSON_AddStringToObject(json, "version", Version);
+
+#ifdef SIST_DEBUG_INFO
+    cJSON_AddStringToObject(json, "mongooseVersion", MG_VERSION);
     cJSON_AddStringToObject(json, "esVersion", es_version);
-    cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
-    cJSON_AddBoolToObject(json, "esVersionLegacy", IS_LEGACY_VERSION(WebCtx.es_version));
     cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
     cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
-    cJSON_AddStringToObject(json, "lang", WebCtx.lang);
     cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
+    cJSON_AddBoolToObject(json, "showDebugInfo", TRUE);
+#else
+    cJSON_AddBoolToObject(json, "showDebugInfo", FALSE);
+#endif
+
+    cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
+    cJSON_AddBoolToObject(json, "esVersionLegacy", IS_LEGACY_VERSION(WebCtx.es_version));
+    cJSON_AddStringToObject(json, "lang", WebCtx.lang);

     cJSON_AddBoolToObject(json, "auth0Enabled", WebCtx.auth0_enabled);
     if (WebCtx.auth0_enabled) {
@@ -668,6 +666,9 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
         mg_send(nc, r->body, r->size);
     } else if (r->status_code == 0) {
         sist_log("serve.c", LOG_SIST_ERROR, "Could not connect to elasticsearch!");
+
+        mg_http_reply(nc, 503, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER,
+                      "Elasticsearch connection error, see server logs.");
     } else {
         sist_logf("serve.c", LOG_SIST_WARNING, "ElasticSearch error during query (%d)", r->status_code);
         if (r->size != 0) {
2 third-party/libscan/CMakeLists.txt vendored
@@ -106,7 +106,7 @@ find_library(MUPDF_LIB NAMES liblibmupdf.a)
 find_library(CMS_LIB NAMES lcms2)
 find_library(JAS_LIB NAMES jasper)
 find_library(GUMBO_LIB NAMES gumbo)
-find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/11/)
+find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/8/ /usr/lib/gcc/aarch64-linux-gnu/8/)
 find_package(Leptonica CONFIG REQUIRED)
 find_package(FFMPEG REQUIRED)
 find_package(libraw CONFIG REQUIRED)
39 third-party/libscan/libscan/mobi/scan_mobi.c vendored
@@ -1,9 +1,44 @@
 #include "scan_mobi.h"

 #include "../../third-party/libmobi/src/mobi.h"
+#include "../media/media.h"
 #include <errno.h>
 #include "stdlib.h"

+int store_cover(scan_mobi_ctx_t *ctx, document_t *doc, MOBIData *m) {
+    MOBIExthHeader *exth = mobi_get_exthrecord_by_tag(m, EXTH_COVEROFFSET);
+
+    if (exth == NULL) {
+        return FALSE;
+    }
+
+    uint32_t offset = mobi_decode_exthvalue(exth->data, exth->size);
+    size_t first_resource = mobi_get_first_resource_record(m);
+    size_t uid = first_resource + offset;
+    MOBIPdbRecord *record = mobi_get_record_by_seqnumber(m, uid);
+
+    if (record == NULL || record->size < 4) {
+        return FALSE;
+    }
+
+    scan_media_ctx_t media_ctx = {
+            .tn_count = TRUE,
+            .tn_size = ctx->tn_size,
+            .tn_qscale = ctx->tn_qscale,
+            .tesseract_lang = NULL,
+            .tesseract_path = NULL,
+            .read_subtitles = FALSE,
+            .max_media_buffer = 0,
+            .log = ctx->log,
+            .logf = ctx->logf,
+            .store = ctx->store,
+    };
+
+    store_image_thumbnail(&media_ctx, record->data, record->size, doc, "img.jpg");
+
+    return TRUE;
+}
+
 void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

     MOBIData *m = mobi_init();
@@ -72,6 +107,10 @@ void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

     APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

+    if (ctx->enable_tn) {
+        store_cover(ctx, doc, m);
+    }
+
     free(content_str);
     free(buf);
     text_buffer_destroy(&tex);
5 third-party/libscan/libscan/mobi/scan_mobi.h vendored
@@ -7,6 +7,11 @@ typedef struct {
     long content_size;
     log_callback_t log;
     logf_callback_t logf;
+    store_callback_t store;
+
+    int tn_qscale;
+    int tn_size;
+    int enable_tn;
 } scan_mobi_ctx_t;

 void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
2 third-party/libscan/third-party/antiword vendored
Submodule third-party/libscan/third-party/antiword updated: badfdac845...ddb042143e
2 third-party/libscan/third-party/libmobi vendored
Submodule third-party/libscan/third-party/libmobi updated: 395dbde361...864e3a86f2