Compare commits


64 Commits
auth0...3.0.4

Author SHA1 Message Date
d32bda0d68 Bug fixes 2023-04-23 14:15:31 -04:00
499ed0be79 Fix readme link 2023-04-23 12:54:33 -04:00
dc39c0ec4b Add NER support 2023-04-23 12:53:27 -04:00
b5cdd9a5df Work on README, optimize database storage 2023-04-22 16:02:19 -04:00
a8b6886f7b Fix stats page 2023-04-16 19:46:01 -04:00
a7e9b6af96 Flush documents in index 2023-04-15 13:49:18 -04:00
0710dc6d3d Update readme / format support 2023-04-15 13:39:25 -04:00
75b66b5982 Fix #351 2023-04-15 13:06:13 -04:00
9813646c11 Fix #343 2023-04-15 12:39:47 -04:00
ebc9468251 Fix some memory leaks 2023-04-15 11:54:56 -04:00
7baaca5078 add_work fix for problem in #349 pt 2 2023-04-15 09:48:50 -04:00
6c4bdc87cf add_work fix for problem in #349 2023-04-15 09:18:17 -04:00
1ea78887c3 Fix aarch64 build 2023-04-14 21:53:25 -04:00
886fa720ec Fix for ES 8.X #302 2023-04-14 21:48:29 -04:00
d43aac735f Add build flag to toggle debug info in web module 2023-04-14 21:07:48 -04:00
faf438a798 Add error message in home page on ES connection error #331 2023-04-14 20:51:35 -04:00
5b3b9911bd Bug fix for delete iterator 2023-04-13 18:35:36 -04:00
237d55ec9c Merge pull request #348 from simon987/dependabot/npm_and_yarn/sist2-vue/d3-color-and-d3-3.1.0
Bump d3-color and d3 in /sist2-vue
2023-04-10 20:40:35 -04:00
dependabot[bot]
ced4c7de88 Bump d3-color and d3 in /sist2-vue
Bumps [d3-color](https://github.com/d3/d3-color) to 3.1.0 and updates ancestor dependency [d3](https://github.com/d3/d3). These dependencies need to be updated together.


Updates `d3-color` from 1.4.1 to 3.1.0
- [Release notes](https://github.com/d3/d3-color/releases)
- [Commits](https://github.com/d3/d3-color/compare/v1.4.1...v3.1.0)

Updates `d3` from 5.16.0 to 7.8.4
- [Release notes](https://github.com/d3/d3/releases)
- [Changelog](https://github.com/d3/d3/blob/main/CHANGES.md)
- [Commits](https://github.com/d3/d3/compare/v5.16.0...v7.8.4)

---
updated-dependencies:
- dependency-name: d3-color
  dependency-type: indirect
- dependency-name: d3
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-04-11 00:35:05 +00:00
90ee318981 Update CI build script 2023-04-10 20:07:52 -04:00
785121e46c Merge pull request #347 from simon987/dependabot/npm_and_yarn/sist2-admin/frontend/webpack-5.78.0
Bump webpack from 5.75.0 to 5.78.0 in /sist2-admin/frontend
2023-04-10 19:57:27 -04:00
585c57a2ad Fix antiword version 2023-04-10 19:57:02 -04:00
42abbbce95 Fix libmobi version 2023-04-10 19:54:05 -04:00
dependabot[bot]
e8607df26f Bump webpack from 5.75.0 to 5.78.0 in /sist2-admin/frontend
Bumps [webpack](https://github.com/webpack/webpack) from 5.75.0 to 5.78.0.
- [Release notes](https://github.com/webpack/webpack/releases)
- [Commits](https://github.com/webpack/webpack/compare/v5.75.0...v5.78.0)

---
updated-dependencies:
- dependency-name: webpack
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-04-10 23:51:06 +00:00
f1726ca0a9 Merge pull request #342 from simon987/dependabot/npm_and_yarn/sist2-vue/webpack-5.76.1
Bump webpack from 5.75.0 to 5.76.1 in /sist2-vue
2023-04-10 19:50:40 -04:00
3ef675abcf Merge pull request #345 from simon987/process-pool
Process pool
2023-04-10 19:50:23 -04:00
01490d1cbf Update sist2-admin for 3.x.x, more fixes 2023-04-10 19:45:08 -04:00
6182338f29 Update dependencies, fix some build issues 2023-04-10 15:10:56 -04:00
300c70883d Fixes and cleanup 2023-04-10 11:04:16 -04:00
fc36f33d52 use sqlite to save index, major thread pool refactor 2023-04-03 21:39:50 -04:00
dependabot[bot]
81658efb19 Bump webpack from 5.75.0 to 5.76.1 in /sist2-vue
Bumps [webpack](https://github.com/webpack/webpack) from 5.75.0 to 5.76.1.
- [Release notes](https://github.com/webpack/webpack/releases)
- [Commits](https://github.com/webpack/webpack/compare/v5.75.0...v5.76.1)

---
updated-dependencies:
- dependency-name: webpack
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-03-15 10:25:29 +00:00
ca973d63a4 Still WIP.. 2023-03-12 11:38:31 -04:00
f8abffba81 process pool mostly works, still WIP 2023-03-09 22:11:21 -05:00
60c77678b4 Merge pull request #339 from einfachTobi/patch-1
Update messages.ts
2023-02-28 17:40:45 -05:00
einfachTobi
bf1d2f7d55 Update messages.ts 2023-02-28 11:24:02 +01:00
8c662bb8f8 Adjust some structs 2023-02-27 20:44:25 -05:00
9c40dddd41 remove deprecated note 2023-02-26 11:03:29 -05:00
d259b95017 Update sist2-admin database schema, fix thumbnail-size 2023-02-26 10:42:20 -05:00
707bac86b3 Fix #329, version bump 2023-02-23 21:21:54 -05:00
8b9b067c06 Fix #332 2023-02-23 19:53:05 -05:00
b17f3ff924 Merge pull request #338 from simon987/dependabot/npm_and_yarn/sist2-admin/frontend/sideway/formula-3.0.1
Bump @sideway/formula from 3.0.0 to 3.0.1 in /sist2-admin/frontend
2023-02-23 19:32:09 -05:00
e44fbf741c update libscan-test-files 2023-02-23 18:13:27 -05:00
fa14efbeb6 Handle zipbomb files 2023-02-22 22:25:21 -05:00
c510162dd9 Fix duration formatting in sist2-admin 2023-02-16 21:07:30 -05:00
f5c664507f use index name in sist2-admin auto-named dir 2023-02-16 09:03:06 -05:00
2805fd509f Fix tag-auth param in sist2-admin #337 2023-02-13 20:19:24 -05:00
20adcce4a9 Remove default tags, add configurable featured line 2023-02-13 20:14:11 -05:00
1e6e24111b Add german in loading page 2023-02-13 20:13:07 -05:00
dependabot[bot]
5a76b855c9 Bump @sideway/formula from 3.0.0 to 3.0.1 in /sist2-admin/frontend
Bumps [@sideway/formula](https://github.com/sideway/formula) from 3.0.0 to 3.0.1.
- [Release notes](https://github.com/sideway/formula/releases)
- [Commits](https://github.com/sideway/formula/compare/v3.0.0...v3.0.1)

---
updated-dependencies:
- dependency-name: "@sideway/formula"
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-02-09 08:44:17 +00:00
6f759642fc Rework duration/resolution badge style 2023-02-07 20:39:12 -05:00
587c9a2c90 Add de lang option in config page 2023-02-03 09:27:30 -05:00
821a571ecf Merge pull request #335 from einfachTobi/UI-localization-german
UI localization german + equations in tesseract
2023-02-03 09:19:33 -05:00
einfachTobi
9020246a01 Merge branch 'simon987:master' into UI-localization-german 2023-02-03 10:18:54 +01:00
einfachTobi
200c000c5a Update Dockerfile 2023-02-03 10:18:43 +01:00
einfachTobi
a43f930d00 Update messages.ts 2023-02-03 10:12:24 +01:00
abe120197a Remove generated files from repo, build vue frontends in Dockerfile 2023-02-02 20:31:16 -05:00
9e0d7bf992 Add test files as submodule, remove support for msword thumbnails 2023-02-02 19:52:37 -05:00
einfachTobi
959d4b4386 Update messages.ts 2023-02-01 14:55:37 +01:00
einfachTobi
742a50be03 Update messages.ts 2023-02-01 12:54:06 +01:00
87ecc5ef6d Update USAGE.md 2023-01-29 12:47:17 -05:00
2e3d648796 Update --thumbnail-quality argument, add documentation 2023-01-29 11:24:34 -05:00
9972e21fcc Fix lightbox 2023-01-26 20:20:58 -05:00
c625c03552 Fix #328 2023-01-25 21:30:18 -05:00
5863b9cd6e Merge pull request #327 from simon987/auth0
Add support for auth0
2023-01-24 19:56:05 -05:00
146 changed files with 9060 additions and 8294 deletions

.dockerignore

@@ -15,7 +15,6 @@ Makefile
**/*.cbp
VERSION
**/node_modules/
.git/
sist2-*-linux-debug
sist2-*-linux
sist2_debug
@@ -33,4 +32,9 @@ tmp_scan/
Dockerfile
Dockerfile.arm64
docker-compose.yml
state.db
state.db
*-journal
build/
__pycache__/
sist2-vue/dist
sist2-admin/frontend/dist

.gitattributes

@@ -1,3 +0,0 @@
CMakeModules/* linguist-vendored
**/*_generated.c linguist-vendored
**/*_generated.h linguist-vendored

.gitignore

@@ -33,3 +33,14 @@ state.db
*.pyc
!sist2-admin/frontend/dist
*.js.map
sist2-vue/dist
sist2-admin/frontend/dist
.ninja_deps
.ninja_log
build.ninja
src/web/static_generated.c
src/magic_generated.c
src/index/static_generated.c
*.sist2
*-shm
*-journal

.gitmodules

@@ -10,3 +10,6 @@
[submodule "third-party/libscan/third-party/libmobi"]
path = third-party/libscan/third-party/libmobi
url = https://github.com/bfabiszewski/libmobi
[submodule "third-party/libscan/libscan-test-files"]
path = third-party/libscan/libscan-test-files
url = https://github.com/simon987/libscan-test-files

CMakeLists.txt

@@ -5,7 +5,7 @@ set(CMAKE_C_STANDARD 11)
option(SIST_DEBUG "Build a debug executable" on)
option(SIST_FAST "Enable more optimisation flags" off)
option(SIST_FAKE_STORE "Disable IO operations of LMDB stores for debugging purposes" 0)
option(SIST_DEBUG_INFO "Turn on debug information in web interface" on)
add_compile_definitions(
"SIST_PLATFORM=${SIST_PLATFORM}"
@@ -15,36 +15,50 @@ if (SIST_DEBUG)
add_compile_definitions(
"SIST_DEBUG=${SIST_DEBUG}"
)
endif()
set(VCPKG_BUILD_TYPE debug)
else ()
set(VCPKG_BUILD_TYPE release)
endif ()
if (SIST_DEBUG_INFO)
add_compile_definitions(
"SIST_DEBUG_INFO=${SIST_DEBUG_INFO}"
)
endif ()
add_subdirectory(third-party/libscan)
set(ARGPARSE_SHARED off)
add_subdirectory(third-party/argparse)
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
src/io/store.h src/io/store.c
src/tpool.h src/tpool.c
src/parsing/parse.h src/parsing/parse.c
src/parsing/magic_util.c src/parsing/magic_util.h
src/io/serialize.h src/io/serialize.c
src/parsing/mime.h src/parsing/mime.c src/parsing/mime_generated.c
src/index/web.c src/index/web.h
src/web/serve.c src/web/serve.h
src/web/web_util.c src/web/web_util.h
src/index/elastic.c src/index/elastic.h
src/util.c src/util.h
src/ctx.h src/types.h
src/ctx.c src/ctx.h
src/types.h
src/log.c src/log.h
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h
src/database/database.c src/database/database.h
src/parsing/fs_util.h
src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
)
src/database/database_stats.c src/database/database_schema.c)
set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
@@ -52,16 +66,11 @@ set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
find_package(PkgConfig REQUIRED)
pkg_search_module(GLIB REQUIRED glib-2.0)
find_package(lmdb CONFIG REQUIRED)
find_package(cJSON CONFIG REQUIRED)
find_package(unofficial-mongoose CONFIG REQUIRED)
find_package(CURL CONFIG REQUIRED)
find_library(MAGIC_LIB
NAMES libmagic.so.1 magic
PATHS /usr/lib/x86_64-linux-gnu/ /usr/lib/aarch64-linux-gnu/
)
find_library(MAGIC_LIB NAMES libmagic.a REQUIRED)
find_package(unofficial-sqlite3 CONFIG REQUIRED)
target_include_directories(
@@ -70,7 +79,6 @@ target_include_directories(
${CMAKE_SOURCE_DIR}/third-party/utf8.h/
${CMAKE_SOURCE_DIR}/third-party/libscan/
${CMAKE_SOURCE_DIR}/
${GLIB_INCLUDE_DIRS}
)
target_compile_options(
@@ -88,7 +96,7 @@ if (SIST_DEBUG)
-fno-omit-frame-pointer
-fsanitize=address
-fno-inline
# -O2
# -O2
)
target_link_options(
sist2
@@ -120,6 +128,7 @@ else ()
-Ofast
-fno-stack-protector
-fomit-frame-pointer
-w
)
endif ()
@@ -133,20 +142,16 @@ target_link_libraries(
sist2
z
lmdb
cjson
argparse
${GLIB_LDFLAGS}
unofficial::mongoose::mongoose
CURL::libcurl
pthread
c
scan
${MAGIC_LIB}
unofficial::sqlite3::sqlite3
)
add_custom_target(

Dockerfile

@@ -1,6 +1,11 @@
FROM simon987/sist2-build as build
MAINTAINER simon987 <me@simon987.net>
ENV DEBIAN_FRONTEND=noninteractive
RUN curl -fsSL https://deb.nodesource.com/setup_16.x | bash
RUN apt update -y; apt install -y nodejs && rm -rf /var/lib/apt/lists/*
WORKDIR /build/
COPY scripts scripts
@@ -9,14 +14,17 @@ COPY CMakeLists.txt .
COPY third-party third-party
COPY src src
COPY sist2-vue sist2-vue
COPY sist2-admin sist2-admin
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2 || mv sist2_debug sist2
RUN cd sist2-vue/ && npm install && npm run build
RUN cd sist2-admin/frontend/ && npm install && npm run build
RUN mkdir build && cd build && cmake -DSIST_PLATFORM=x64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
RUN cd build && make -j$(nproc)
RUN strip build/sist2 || mv build/sist2_debug build/sist2
FROM --platform="linux/amd64" ubuntu@sha256:965fbcae990b0467ed5657caceaec165018ef44a4d2d46c7cdea80a9dff0d1ea
WORKDIR /root
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
@@ -24,7 +32,7 @@ ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y curl libasan5 libmagic1 python3 \
python3-pip git tesseract-ocr libpq-dev && rm -rf /var/lib/apt/lists/*
python3-pip git tesseract-ocr && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -35,12 +43,15 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
curl -o /usr/share/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
curl -o /usr/share/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
curl -o /usr/share/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata
# sist2
COPY --from=build /build/sist2 /root/sist2
COPY --from=build /build/build/sist2 /root/sist2
# sist2-admin
COPY sist2-admin/requirements.txt sist2-admin/
RUN python3 -m pip install --no-cache -r sist2-admin/requirements.txt
COPY sist2-admin/ sist2-admin/
WORKDIR /root/sist2-admin
COPY sist2-admin/requirements.txt /root/sist2-admin/
RUN python3 -m pip install --no-cache -r /root/sist2-admin/requirements.txt
COPY --from=build /build/sist2-admin/ /root/sist2-admin/

Dockerfile.arm64

@@ -3,13 +3,20 @@ MAINTAINER simon987 <me@simon987.net>
WORKDIR /build/
ADD . /build/
RUN cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2
RUN mkdir build && cd build && cmake -DSIST_PLATFORM=arm64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
RUN cd build && make -j$(nproc)
RUN strip build/sist2 || mv build/sist2_debug build/sist2
FROM --platform="linux/arm64/v8" ubuntu:20.04
FROM --platform=linux/arm64/v8 ubuntu@sha256:537da24818633b45fcb65e5285a68c3ec1f3db25f5ae5476a7757bc8dfae92a3
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
WORKDIR /root
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
RUN apt update && apt install -y curl libasan5 libmagic1 tesseract-ocr python3-pip python3 git && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -18,11 +25,16 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/eng.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/eng.traineddata &&\
curl -o /usr/share/tessdata/fra.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/fra.traineddata &&\
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
curl -o /usr/share/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
curl -o /usr/share/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
curl -o /usr/share/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
curl -o /usr/share/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# sist2
COPY --from=build /build/build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2
# sist2-admin
COPY sist2-admin/requirements.txt sist2-admin/
RUN python3 -m pip install --no-cache -r sist2-admin/requirements.txt
COPY --from=build /build/sist2-admin/ sist2-admin/

README.md

@@ -10,13 +10,13 @@ sist2 (Simple incremental search tool)
*Warning: sist2 is in early development*
![search panel](docs/sist2.png)
![search panel](docs/sist2.gif)
## Features
* Fast, low memory usage, multi-threaded
* Manage & schedule scan jobs with simple web interface (Docker only)
* Mobile-friendly Web interface
* Portable (all its features are packaged in a single executable)
* Extracts text and metadata from common file types \*
* Generates thumbnails \*
* Incremental scanning
@@ -24,47 +24,60 @@ sist2 (Simple incremental search tool)
* Recursive scan inside archive files \*\*
* OCR support with tesseract \*\*\*
* Stats page & disk utilisation visualization
* Named-entity recognition (client-side) \*\*\*\*
\* See [format support](#format-support)
\*\* See [Archive files](#archive-files)
\*\*\* See [OCR](#ocr)
![stats](docs/stats.png)
\*\*\*\* See [Named-Entity Recognition](#NER)
## Getting Started
### Using Docker Compose *(Windows/Linux/Mac)*
```yaml
version: "3"
services:
elasticsearch:
image: elasticsearch:7.17.9
restart: unless-stopped
environment:
- "discovery.type=single-node"
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
sist2-admin:
image: simon987/sist2:3.0.3
restart: unless-stopped
volumes:
- ./sist2-admin-data/:/sist2-admin/
- /:/host
ports:
- 4090:4090 # sist2
- 8080:8080 # sist2-admin
working_dir: /root/sist2-admin/
entrypoint: python3 /root/sist2-admin/sist2_admin/app.py
```
Navigate to http://localhost:8080/ to configure sist2-admin.
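Assuming the file above is saved as `docker-compose.yml`, bringing the stack up is the standard command (a sketch; use the `docker-compose` binary instead if you are on Compose v1):

```bash
docker compose up -d
```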
### Using the executable file *(Linux/WSL only)*
1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
2. *(or)* Run using docker:
```bash
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.0
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.17.9
```
1. *(or)* Run using docker-compose:
```yaml
elasticsearch:
  image: docker.elastic.co/elasticsearch/elasticsearch:7.14.0
  environment:
    - discovery.type=single-node
    - "ES_JAVA_OPTS=-Xms1G -Xmx2G"
```
1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
3. *(or)* `docker pull simon987/sist2:2.12.1-x64-linux`
1. See [Usage guide](docs/USAGE.md)
2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
3. See [usage guide](docs/USAGE.md) for command line usage.
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
Example usage:
## Example usage
See [Usage guide](docs/USAGE.md) for more details
1. Scan a directory: `sist2 scan ~/Documents -o ./docs_idx`
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
1. Start web interface: `sist2 web ./docs_idx`
1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
2. Push index to Elasticsearch: `sist2 index ./documents.sist2`
3. Start web interface: `sist2 web ./documents.sist2`
## Format support
@@ -81,8 +94,8 @@ See [Usage guide](docs/USAGE.md) for more details
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
| mobi, azw, azw3 | libmobi | yes | no | author, title |
| doc (MS Word 97-2003) | antiword | yes | no | author, title |
| mobi, azw, azw3 | libmobi | yes | yes | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
@@ -109,10 +122,10 @@ Download the language data files with your package manager (`apt install tessera
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
(hin, jpn, eng, fra, rus, spa, chi_sim, deu) pre-installed.
You can use the `+` separator to specify multiple languages. The language
name must be identical to the `*.traineddata` file installed on your system
name must be identical to the `*.traineddata` file installed on your system
(use `chi_sim` rather than `chi-sim`).
Examples:
@@ -123,39 +136,63 @@ sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
```
### NER
sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported repository URL to
**Configuration** > **Machine learning options** > **Model repositories**
to enable it.
The text processing is done in your browser, no data is sent to any third-party services.
See [simon987/sist2-ner-models](https://github.com/simon987/sist2-ner-models) for more details.
#### List of available repositories:
| URL | Maintainer | Purpose |
|---------------------------------------------------------------------------------------------------------|-----------------------------------------|---------|
| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General |
<details>
<summary>Screenshot</summary>
![ner](docs/ner.png)
</details>
## Build from source
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries
### With docker (recommended)
### Using docker
```bash
git clone --recursive https://github.com/simon987/sist2/
cd sist2
docker build . -f ./Dockerfile -t my-sist2-image
docker build . -t my-sist2-image
# Copy sist2 executable from docker image
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
```
### On a linux computer
### Using a linux computer
1. Install compile-time dependencies
```bash
apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git
apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git nodejs
```
1. Apply vcpkg patches, as per [sist2-build](https://github.com/simon987/sist2-build) Dockerfile
1. Install vcpkg dependencies
2. Install vcpkg using my fork: https://github.com/simon987/vcpkg
3. Install vcpkg dependencies
```bash
vcpkg install curl[core,openssl]
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
vcpkg install curl[core,openssl] sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample]
```
1. Build
4. Build
```bash
git clone --recursive https://github.com/simon987/sist2/
(cd sist2-vue; npm install; npm run build)
(cd sist2-admin/frontend; npm install; npm run build)
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
make
```


@@ -12,7 +12,7 @@ REWRITE_URL=""
sist2 scan \
--threads 14 \
--mem-throttle 32768 \
--quality 1.0 \
--thumbnail-quality 2 \
--name $NAME \
--ocr-lang=eng+chi_sim \
--ocr-ebooks \


@@ -12,7 +12,7 @@ REWRITE_URL=""
sist2 scan \
--threads 14 \
--mem-throttle 32768 \
--quality 1.0 \
--thumbnail-quality 2 \
--name $NAME \
--ocr-lang=eng+chi_sim \
--ocr-ebooks \

docker-compose.yml

@@ -2,7 +2,7 @@ version: "3"
services:
  elasticsearch:
    image: elasticsearch:7.14.0
    image: elasticsearch:7.17.9
    container_name: sist2-es
    environment:
      - "discovery.type=single-node"
@@ -15,9 +15,9 @@ services:
      - /mnt/array/sist2-admin-data/:/sist2-admin/
      - /:/host
    ports:
      - 4090:4090
      # NOTE: Don't export this port publicly!
      - 8080:8080
      - 4090:4090
    working_dir: /root/sist2-admin/
    entrypoint: python3
    command:

docs/USAGE.md

@@ -1,78 +1,64 @@
# Usage
*More examples (specifically with docker/compose) are in progress*
* [scan](#scan)
* [options](#scan-options)
* [examples](#scan-examples)
* [index format](#index-format)
* [index](#index)
* [options](#index-options)
* [examples](#index-examples)
* [web](#web)
* [options](#web-options)
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
```
Usage: sist2 scan [OPTION]... PATH
or: sist2 index [OPTION]... INDEX
or: sist2 web [OPTION]... INDEX...
or: sist2 exec-script [OPTION]... INDEX
Lightning-fast file system indexer and search tool.
-h, --help show this help message and exit
-v, --version Show version and exit
--verbose Turn on logging
--very-verbose Turn on debug messages
-v, --version Print version and exit.
--verbose Turn on logging.
--very-verbose Turn on debug messages.
--json-logs Output logs in JSON format.
Scan options
-t, --threads=<int> Number of threads. DEFAULT=1
--mem-throttle=<int> Total memory threshold in MiB for scan throttling. DEFAULT=0
-q, --thumbnail-quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=1
--thumbnail-size=<int> Thumbnail size, in pixels. DEFAULT=500
--thumbnail-count=<int> Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
--content-size=<int> Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files.
-o, --output=<str> Output directory. DEFAULT=index.sist2/
-t, --threads=<int> Number of threads. DEFAULT: 1
-q, --thumbnail-quality=<int> Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2
--thumbnail-size=<int> Thumbnail size, in pixels. DEFAULT: 552
--thumbnail-count=<int> Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT: 1
--content-size=<int> Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT: 32768
-o, --output=<str> Output index file path. DEFAULT: index.sist2
--incremental If the output file path exists, only scan new or modified files.
--optimize-index Defragment index file after scan to reduce its file size.
--rewrite-url=<str> Serve files from this url instead of from disk.
--name=<str> Index display name. DEFAULT: (name of the directory)
--name=<str> Index display name. DEFAULT: index
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: don't scan, list: only save file names as text, shallow: don't scan archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr-lang=<str> Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
--ocr-images Enable OCR'ing of image files.
--ocr-ebooks Enable OCR'ing of ebook files.
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
-e, --exclude=<str> Files that match this regex will not be scanned.
--fast Only index file names & mime type.
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files.
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata).
--checksums Calculate file checksums when scanning.
--list-file=<str> Specify a list of newline-delimited paths to be scanned instead of normal directory traversal. Use '-' to read from stdin.
Index options
-t, --threads=<int> Number of threads. DEFAULT=1
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
-p, --print Just print JSON documents to stdout.
--incremental-index Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch.
-t, --threads=<int> Number of threads. DEFAULT: 1
--es-url=<str> Elasticsearch url with port. DEFAULT: http://localhost:9200
--es-insecure-ssl Do not verify SSL connections to Elasticsearch.
--es-index=<str> Elasticsearch index name. DEFAULT: sist2
-p, --print Print JSON documents to stdout instead of indexing to elasticsearch.
--incremental-index Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
--script-file=<str> Path to user script.
--mappings-file=<str> Path to Elasticsearch mappings.
--settings-file=<str> Path to Elasticsearch settings.
--async-script Execute user script asynchronously.
--batch-size=<int> Index batch size. DEFAULT: 100
-f, --force-reset Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
--batch-size=<int> Index batch size. DEFAULT: 70
-f, --force-reset Reset Elasticsearch mappings and settings.
Web options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
--bind=<str> Listen on this address. DEFAULT=localhost:4090
--es-url=<str> Elasticsearch url. DEFAULT: http://localhost:9200
--es-insecure-ssl Do not verify SSL connections to Elasticsearch.
--es-index=<str> Elasticsearch index name. DEFAULT: sist2
--bind=<str> Listen for connections on this address. DEFAULT: localhost:4090
--auth=<str> Basic auth in user:password format
--auth0-audience=<str> API audience/identifier
--auth0-domain=<str> Application domain
@@ -84,75 +70,23 @@ Web options
--lang=<str> Default UI language. Can be changed by the user
Exec-script options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
--es-url=<str> Elasticsearch url. DEFAULT: http://localhost:9200
--es-insecure-ssl Do not verify SSL connections to Elasticsearch.
--es-index=<str> Elasticsearch index name. DEFAULT: sist2
--script-file=<str> Path to user script.
--async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
```
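A minimal `web` invocation using the new auth0 options might look like the sketch below; only `--auth0-audience` and `--auth0-domain` are visible in the help text above (the remaining auth0 flags are cut off in this hunk), and all values are placeholders:

```bash
sist2 web \
  --auth0-audience my-api-identifier \
  --auth0-domain my-tenant.us.auth0.com \
  --bind localhost:4090 \
  ./documents.sist2
```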
## Scan
#### Thumbnail database size estimation
### Scan options
See the chart below for a rough estimate of thumbnail database size vs. the thumbnail size & quality arguments:
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `--mem-throttle`
Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
* `-q, --thumbnail-quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--thumbnail-size`
Thumbnail size in pixels.
* `--thumbnail-count`
Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image
every ~7s). Set to 0 to completely disable thumbnails.
* `--content-size`
Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
Repeated whitespace and special characters do not count toward this limit.
Set to 0 to completely disable content parsing.
* `--incremental`
Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
will be copied to the new index and will not be parsed again.
* `-o, --output` Output directory.
* `--rewrite-url` Set the `rewrite_url` option for the web module (See [rewrite_url](#rewrite_url))
* `--name` Set the `name` option for the web module
* `--depth` Maximum scan depth. Set to 0 to only scan files directly in the root directory, set to -1 for infinite depth
* `--archive` Archive file mode.
* skip: Don't parse
* list: Only get file names as text
* shallow: Don't parse archives inside archives.
* recurse: Scan archives recursively (default)
* `--ocr-lang`, `--ocr-ebooks`, `--ocr-images` See [OCR](../README.md#OCR)
* `-e, --exclude` Regex pattern to exclude files. A file is excluded if the pattern matches any
part of the full absolute path.
Examples:
* `-e ".*\.ttf"`: Ignore ttf files
* `-e ".*\.(ttf|rar)"`: Ignore ttf and rar files
* `-e "^/mnt/backups/"`: Ignore all files in the `/mnt/backups/` directory
* `-e "^/mnt/Data[12]/"`: Ignore all files in the `/mnt/Data1/` and `/mnt/Data2/` directory
* `-e "(^/usr/)|(^/var/)|(^/media/DRIVE-A/tmp/)|(^/media/DRIVE-B/Trash/)"` Exclude the
`/usr`, `/var`, `/media/DRIVE-A/tmp`, `/media/DRIVE-B/Trash` directories
* `--fast` Only index file names and mime type
* `--treemap-threshold` Directories smaller than (`treemap-threshold` * `<total size of the index>`)
will not be considered for the disk utilisation visualization; their size will be added to
the parent directory. If the parent directory is still smaller than the threshold, it will also be "merged upwards"
and so on.
In effect, smaller `treemap-threshold` values will yield a more detailed
(but also a more cluttered and harder to read) visualization.
* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files
larger than this number will be read sequentially and no *seek* operations will be supported.
For example, `--thumbnail-size=500`, `--thumbnail-quality=2` for a directory with 8 million images will create a thumbnail database
that is about `8000000 * 36kB = 288GB`.
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read
operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
![thumbnail_size](thumbnail_size.png)
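The two axes of the chart map directly onto the scan flags; a minimal sketch (path and output name are illustrative):

```bash
sist2 scan --thumbnail-size 500 --thumbnail-quality 2 ~/Images -o ./images.sist2
```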
### Scan examples
@@ -161,85 +95,22 @@ Simple scan
sist2 scan ~/Documents
sist2 scan \
--threads 4 --content-size 16000000 --quality 1.0 --archive shallow \
--threads 4 --content-size 16000000 --thumbnail-quality 2 --archive shallow \
--name "My Documents" --rewrite-url "http://nas.domain.local/My Documents/" \
~/Documents -o ./documents.idx/
~/Documents -o ./documents.sist2
```
Incremental scan
```
sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
If the index file does not exist, `--incremental` has no effect.
```bash
sist2 scan ~/Documents -o ./documents.sist2
sist2 scan ~/Documents -o ./documents.sist2 --incremental
# or
sist2 scan ~/Documents -o ./documents.sist2 --incremental
sist2 scan ~/Documents -o ./documents.sist2 --incremental
```
### Index format
A typical `ndjson` type index structure looks like this:
```
documents.idx/
├── descriptor.json
├── _index_main.ndjson.zst
├── treemap.csv
├── agg_mime.csv
├── agg_date.csv
├── add_size.csv
├── thumbs/
| ├── data.mdb
| └── lock.mdb
├── tags/
| ├── data.mdb
| └── lock.mdb
└── meta/
├── data.mdb
└── lock.mdb
```
The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed, newline-delimited file.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails.
The `descriptor.json` file contains general information about the index. The
following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rewrite_url) and `timestamp`.
The `.csv` are pre-computed aggregations necessary for the stats page.
*thumbs/*:
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
## Index
### Index options
* `--es-url`
Elasticsearch url and port. If you are using docker, make sure that both containers are on the
same network.
* `--es-index`
Elasticsearch index name. DEFAULT=sist2
* `-p, --print`
Print index in JSON format to stdout.
* `--incremental-index`
Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
Only the new changes since the last scan will be sent.
* `--script-file`
Path to user script. See [Scripting](scripting.md).
* `--mappings-file`
Path to custom Elasticsearch mappings. If none is specified, [the bundled mappings](https://github.com/simon987/sist2/tree/master/schema) will be used.
* `--settings-file`
Path to custom Elasticsearch settings. *(See above)*
* `--async-script`
Use `wait_for_completion=false` elasticsearch option while executing user script.
(See [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html))
* `--batch-size=<int>`
Index batch size. Indexing is generally faster with larger batches, but payloads that
are too large will fail and additional overhead for retrying with smaller sizes may slow
down the process.
* `-f, --force-reset`
Reset Elasticsearch mappings and settings.
* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.
### Index examples
**Push to elasticsearch**
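The example body is cut off in this hunk; a minimal sketch with the documented defaults made explicit:

```bash
sist2 index --es-url http://localhost:9200 --es-index sist2 ./documents.sist2
```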
@@ -368,8 +239,8 @@ The sidecar file must have exactly the same file path and the `.s2meta` suffix.
```
```
sist2 scan ~/Documents -o ./docs.idx
sist2 index ./docs.idx
sist2 scan ~/Documents -o ./docs.sist2
sist2 index ./docs.sist2
```
*NOTE*: It is technically possible to overwrite the `tag` value using sidecar files, however,

docs/ner.png (new binary file, 448 KiB)

docs/sist2.gif (new binary file, 3.7 MiB)

(binary file removed, 1011 KiB)

docs/thumbnail_size.png (new binary file, 180 KiB)


@@ -1,10 +1,13 @@
#!/usr/bin/env bash
rm -rf index.sist2/
(
cd ..
rm -rf index.sist2
python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
python3 scripts/magic_static.py > src/magic_generated.c
python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
python3 scripts/magic_static.py > src/magic_generated.c
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
)


@@ -4,14 +4,20 @@ VCPKG_ROOT="/vcpkg"
git submodule update --init --recursive
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j $(nproc)
strip sist2
./sist2 -v > VERSION
mv sist2 sist2-x64-linux
mkdir build
(
cd build
cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
make -j $(nproc)
strip sist2
./sist2 -v > VERSION
)
mv build/sist2 sist2-x64-linux
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j $(nproc)
mv sist2_debug sist2-x64-linux-debug
(
cd build
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
make -j $(nproc)
)
mv build/sist2_debug sist2-x64-linux-debug


@@ -4,14 +4,19 @@ VCPKG_ROOT="/vcpkg"
git submodule update --init --recursive
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j $(nproc)
strip sist2
mv sist2 sist2-arm64-linux
mkdir build
(
cd build
cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
make -j $(nproc)
strip sist2
)
mv build/sist2 sist2-arm64-linux
rm -rf CMakeFiles CMakeCache.txt
cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
make -j $(nproc)
strip sist2
mv sist2_debug sist2-arm64-linux-debug
(
cd build
cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
make -j $(nproc)
)
mv build/sist2_debug sist2-arm64-linux-debug

scripts/mime.csv

@@ -1,3 +1,4 @@
application/x-matlab-data,mat
application/arj, arj
application/base64, mme
application/binhex, hqx
@@ -29,7 +30,7 @@ application/mime, aps
application/mspowerpoint, ppz
application/msword, doc|dot|w6w|wiz|word
application/netmc, mcp
application/octet-stream, bin|dump|gpg
application/octet-stream, bin|dump|gpg|pack|idx
application/oda, oda
application/ogg, ogv
application/pdf, pdf
@@ -243,7 +244,7 @@ audio/make, funk|my|pfunk
audio/midi, kar
audio/mid, rmi
audio/mp4, m4b
audio/mpeg, m2a|mpa
audio/mpeg, m2a|mpa|mpga
audio/ogg, ogg
audio/s3m, s3m
audio/tsp-audio, tsi
@@ -346,6 +347,8 @@ text/mcf, mcf
text/pascal, pas
text/PGP,
text/plain, com|cmd|conf|def|g|idc|list|lst|mar|sdml|text|txt|md|groovy|license|properties|desktop|ini|rst|cmake|ipynb|readme|less|lo|go|yml|d|cs|hpp|srt|nfo|sfv|m3u|csv|eml|make|log|markdown|yaml
text/x-script.python, pyx
text/csv,
application/vnd.coffeescript, coffee
text/richtext, rt|rtf|rtx
text/rtf,
@@ -382,7 +385,7 @@ text/x-pascal, p
text/x-perl, pl
text/x-php, php
text/x-po, po
text/x-python, py
text/x-python, py|pyi
text/x-ruby, rb
text/x-sass, sass
text/x-scss, scss

scripts/mime.py

@@ -1,3 +1,5 @@
import zlib
mimes = {}
noparse = set()
ext_in_hash = set()
@@ -135,24 +137,40 @@ def clean(t):
    return t.replace("/", "_").replace(".", "_").replace("+", "_").replace("-", "_")
def crc(s):
    return zlib.crc32(s.encode()) & 0xffffffff
with open("scripts/mime.csv") as f:
    for l in f:
        mime, ext_list = l.split(",")
        if l.startswith("!"):
            mime = mime[1:]
            noparse.add(mime)
        ext = [x.strip() for x in ext_list.split("|")]
        ext = [x.strip() for x in ext_list.split("|") if x.strip() != ""]
        mimes[mime] = ext
    seen_crc = set()
    for ext in mimes.values():
        for e in ext:
            if crc(e) in seen_crc:
                raise Exception("CRC32 collision")
            seen_crc.add(crc(e))
    seen_crc = set()
    for mime in mimes.keys():
        if crc(mime) in seen_crc:
            raise Exception("CRC32 collision")
        seen_crc.add(crc(mime))
    print("// **Generated by mime.py**")
    print("#ifndef MIME_GENERATED_C")
    print("#define MIME_GENERATED_C")
    print("#include <glib.h>\n")
    print("#include <stdlib.h>\n")
    # Enum
    print("enum mime {")
    for mime, ext in sorted(mimes.items()):
        print(" " + clean(mime) + "=" + mime_id(mime) + ",")
        print(f"{clean(mime)}={mime_id(mime)},")
    print("};")
    # Enum -> string
@@ -163,20 +181,20 @@ with open("scripts/mime.csv") as f:
    print("default: return NULL;}}")
    # Ext -> Enum
    print("GHashTable *mime_get_ext_table() {"
          "GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);")
    print("unsigned int mime_extension_lookup(unsigned long extension_crc32) {"
          "switch (extension_crc32) {")
    for mime, ext in mimes.items():
        for e in [e for e in ext if e]:
            print("g_hash_table_insert(ext_table, \"" + e + "\", (gpointer)" + clean(mime) + ");")
            if e in ext_in_hash:
                raise Exception("extension already in hash: " + e)
            ext_in_hash.add(e)
    print("return ext_table;}")
        if len(ext) > 0:
            for e in ext:
                print(f"case {crc(e)}:", end="")
            print(f"return {clean(mime)};")
    print("default: return 0;}}")
    # string -> Enum
    print("GHashTable *mime_get_mime_table() {"
          "GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);")
    for mime, ext in mimes.items():
        print("g_hash_table_insert(mime_table, \"" + mime + "\", (gpointer)" + clean(mime) + ");")
    print("return mime_table;}")
    print("unsigned int mime_name_lookup(unsigned long mime_crc32) {"
          "switch (mime_crc32) {")
    for mime in mimes.keys():
        print(f"case {crc(mime)}: return {clean(mime)};")
    print("default: return 0;}}")
    print("#endif")


@@ -1,3 +1,3 @@
docker run --rm -it --name "sist2-dev-es"\
-p 9200:9200 -e "discovery.type=single-node" \
-e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:7.14.0
-e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:7.17.9


@@ -1,3 +1,3 @@
docker run --rm -it --name "sist2-dev-es"\
-p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" \
-e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.1.2
-e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.7.0


@@ -1 +0,0 @@
.navbar[data-v-27bc1d68]{box-shadow:0 .125rem .25rem rgba(0,0,0,.08)!important;border-radius:0}.theme-black .navbar[data-v-27bc1d68]{background:rgba(84,107,122,.18823529411764706);border-bottom:none}.navbar-brand[data-v-27bc1d68]{color:#222!important;font-size:1.75rem;padding:0}.navbar-brand[data-v-27bc1d68]:hover{color:#000!important}.version[data-v-27bc1d68]{color:#222!important;margin-left:-18px;margin-top:-14px;font-size:11px;font-family:monospace}.btn-link[data-v-27bc1d68]{color:#222}body,html{height:100%}#app{-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;color:#2c3e50;padding-bottom:1em;min-height:100%}.info-icon{width:1rem;margin-right:.2rem;cursor:pointer;line-height:1rem;height:1rem;background-image:url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0MjYuNjY3IDQyNi42NjciIGZpbGw9IiNmZmYiPjxwYXRoIGQ9Ik0xOTIgMTkyaDQyLjY2N3YxMjhIMTkyeiIvPjxwYXRoIGQ9Ik0yMTMuMzMzIDBDOTUuNDY3IDAgMCA5NS40NjcgMCAyMTMuMzMzczk1LjQ2NyAyMTMuMzMzIDIxMy4zMzMgMjEzLjMzM1M0MjYuNjY3IDMzMS4yIDQyNi42NjcgMjEzLjMzMyAzMzEuMiAwIDIxMy4zMzMgMHptMCAzODRjLTk0LjA4IDAtMTcwLjY2Ny03Ni41ODctMTcwLjY2Ny0xNzAuNjY3UzExOS4yNTMgNDIuNjY3IDIxMy4zMzMgNDIuNjY3IDM4NCAxMTkuMjUzIDM4NCAyMTMuMzMzIDMwNy40MTMgMzg0IDIxMy4zMzMgMzg0eiIvPjxwYXRoIGQ9Ik0xOTIgMTA2LjY2N2g0Mi42Njd2NDIuNjY3SDE5MnoiLz48L3N2Zz4=);filter:brightness(45%);display:block}.tabs{margin-top:10px}.modal-title{text-overflow:ellipsis;overflow:hidden;white-space:nowrap}@media screen and (min-width:1500px){.container{max-width:1440px}}label{margin-top:.5rem;margin-bottom:0}.shrink[data-v-9b017c42]{flex-grow:inherit}#task-history[data-v-46960281]{font-family:monospace;font-size:12px}#log-tail-output span{display:block}span.DEBUG{color:#9e9e9e}span.WARNING{color:#ffb300}span.INFO{color:#039be5}span.ERROR,span.FATAL{color:#f4511e}span.ADMIN{color:#ee05ff}#log-tail-output{font-size:13px;font-family:monospace;padding:6px;background-color:#f5f5f5;border:1px solid #ccc;border-radius:4px;margin:3px;white-space:pre;color:#000;overflow:hidden}

File diff suppressed because one or more lines are too long

(binary file removed, 15 KiB)


@@ -1 +0,0 @@
<!DOCTYPE html><html lang=""><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1"><link rel="icon" href="favicon.ico"><title>sist2-admin</title><link href="css/app.css" rel="preload" as="style"><link href="css/chunk-vendors.css" rel="preload" as="style"><link href="js/app.js" rel="preload" as="script"><link href="js/chunk-vendors.js" rel="preload" as="script"><link href="css/chunk-vendors.css" rel="stylesheet"><link href="css/app.css" rel="stylesheet"></head><body><noscript><strong>We're sorry but sist2-admin-vue doesn't work properly without JavaScript enabled. Please enable it to continue.</strong></noscript><div id="app"></div><script src="js/chunk-vendors.js"></script><script src="js/app.js"></script></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

package.json

@@ -20,14 +20,11 @@
},
"devDependencies": {
"@vue/cli-plugin-babel": "~5.0.8",
"@vue/cli-plugin-eslint": "~5.0.8",
"@vue/cli-plugin-router": "~5.0.8",
"@vue/cli-plugin-vuex": "~5.0.8",
"@vue/cli-service": "~5.0.8",
"babel-eslint": "^10.1.0",
"bootstrap": "^4.5.2",
"eslint": "^6.7.2",
"eslint-plugin-vue": "^6.2.2",
"vue-template-compiler": "^2.6.11"
},
"eslintConfig": {

public/index.html

@@ -4,7 +4,7 @@
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<link rel="icon" href="<%= BASE_URL %>favicon.ico">
<link rel="icon" href="<%= BASE_URL %>serve_favicon_ico.ico">
<title>sist2-admin</title>
</head>
<body>


@@ -28,16 +28,22 @@ export default {
return this.$store.state.jobDesktopNotificationMap[this.job.name];
}
},
methods: {
mounted() {
this.cronValid = this.checkCron(this.job.cron_expression)
},
methods: {
checkCron(expression) {
return /((((\d+,)+\d+|(\d+([/-])\d+)|\d+|\*) ?){5,7})/.test(expression);
},
updateNotifications(value) {
this.$store.dispatch("setJobDesktopNotification", {
job: this.job.name,
enabled: value
})
});
},
update() {
if (this.job.schedule_enabled) {
this.cronValid = /((((\d+,)+\d+|(\d+([/-])\d+)|\d+|\*) ?){5,7})/.test(this.job.cron_expression);
this.cronValid = this.checkCron(this.job.cron_expression);
} else {
this.cronValid = undefined;
}


@@ -6,9 +6,6 @@
<label>{{ $t("scanOptions.threads") }}</label>
<b-form-input type="number" min="1" v-model="options.threads" @change="update()"></b-form-input>
<label>{{ $t("scanOptions.memThrottle") }}</label>
<b-form-input type="number" min="0" v-model="options.mem_throttle" @change="update()"></b-form-input>
<label>{{ $t("scanOptions.thumbnailQuality") }}</label>
<b-form-input type="number" min="1" max="31" v-model="options.thumbnail_quality" @change="update()"></b-form-input>
@@ -70,8 +67,9 @@
{{ $t("scanOptions.readSubtitles") }}
</b-form-checkbox>
<label>{{ $t("scanOptions.memBuffer") }}</label>
<b-form-input type="number" min="0" v-model="options.mem_buffer" @change="update()"></b-form-input>
<b-form-checkbox v-model="options.optimize_index" @change="update()">
{{ $t("scanOptions.optimizeIndex") }}
</b-form-checkbox>
<label>{{ $t("scanOptions.treemapThreshold") }}</label>
<b-form-input type="number" min="0" v-model="options.treemap_threshold" @change="update()"></b-form-input>

messages.ts

@@ -56,13 +56,17 @@ export default {
tagline: "Tagline in navbar",
auth: "Basic auth in user:password format",
tagAuth: "Basic auth in user:password format for tagging",
auth0Audience: "Auth0 audience",
auth0Domain: "Auth0 domain",
auth0ClientId: "Auth0 client ID",
auth0PublicKey: "Auth0 public key",
},
scanOptions: {
title: "Scanning options",
path: "Path",
threads: "Number of threads",
memThrottle: "Total memory threshold in MiB for scan throttling",
thumbnailQuality: "Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best",
thumbnailQuality: "Thumbnail quality, on a scale of 2 to 31, 2 being the best",
thumbnailCount: "Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails.",
thumbnailSize: "Thumbnail size, in pixels",
contentSize: "Number of bytes to be extracted from text documents. Set to 0 to disable",
@@ -80,7 +84,8 @@ export default {
checksums: "Calculate file checksums when scanning",
readSubtitles: "Read subtitles from media files",
memBuffer: "Maximum memory buffer size per thread in MiB for files inside archives",
treemapThreshold: "Relative size threshold for treemap"
treemapThreshold: "Relative size threshold for treemap",
optimizeIndex: "Defragment index file after scan to reduce its file size."
},
indexOptions: {
title: "Indexing options",

View File

@@ -40,6 +40,39 @@ import TaskListItem from "@/components/TaskListItem";
import Sist2AdminApi from "@/Sist2AdminApi";
import moment from "moment";
const DAY = 3600 * 24;
const HOUR = 3600;
const MINUTE = 60;
function humanDuration(sec_num) {
sec_num = sec_num / 1000;
const days = Math.floor(sec_num / DAY);
sec_num -= days * DAY;
const hours = Math.floor(sec_num / HOUR);
sec_num -= hours * HOUR;
const minutes = Math.floor(sec_num / MINUTE);
sec_num -= minutes * MINUTE;
const seconds = Math.floor(sec_num);
if (days > 0) {
return `${days} days ${hours}h ${minutes}m ${seconds}s`;
}
if (hours > 0) {
return `${hours}h ${minutes}m ${seconds}s`;
}
if (minutes > 0) {
return `${minutes}m ${seconds}s`;
}
if (seconds > 0) {
return `${seconds}s`;
}
return "<0s";
}
export default {
name: 'Tasks',
components: {TaskListItem},
@@ -100,17 +133,10 @@ export default {
})
},
taskDuration(task) {
const start = moment(task.started);
const end = moment(task.ended);
const start = moment.utc(task.started);
const end = moment.utc(task.ended);
let duration = moment.utc(end.diff(start)).format("HH[h] mm[m] ss[s]");
duration = duration.replace("00h ", "");
duration = duration.replace(/^00m /, "");
duration = duration.replace(/00s/, "<1s");
duration = duration.replace(/^0/, "");
return duration;
return humanDuration(end.diff(start))
}
}
}
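
The refactor above replaces moment-based string munging with a dedicated humanDuration() helper. Note that it takes milliseconds (hence the division by 1000 on entry) and that end.diff(start) also returns milliseconds, so the two line up. A quick sanity check of the expected output, assuming the helper were exported for testing (in the diff it is module-private):

// Hypothetical test harness; humanDuration() is the function shown verbatim above.
console.log(humanDuration(90_061_000)); // "1 days 1h 1m 1s"
console.log(humanDuration(61_000));     // "1m 1s"
console.log(humanDuration(500));        // "<0s"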

File diff suppressed because it is too large


@@ -21,13 +21,11 @@ from config import LOG_FOLDER, logger, WEBSERVER_PORT, DATA_FOLDER, SIST2_BINARY
from jobs import Sist2Job, Sist2ScanTask, TaskQueue, Sist2IndexTask, JobStatus
from notifications import Subscribe, Notifications
from sist2 import Sist2
from state import PickleTable, RUNNING_FRONTENDS, TESSERACT_LANGS, DB_SCHEMA_VERSION
from state import migrate_v1_to_v2, RUNNING_FRONTENDS, TESSERACT_LANGS, DB_SCHEMA_VERSION
from web import Sist2Frontend
VERSION = "1.0"
sist2 = Sist2(SIST2_BINARY, DATA_FOLDER)
db = PersistentState(table_factory=PickleTable, dbfile=os.path.join(DATA_FOLDER, "state.db"))
db = PersistentState(dbfile=os.path.join(DATA_FOLDER, "state.db"))
notifications = Notifications()
task_queue = TaskQueue(sist2, db, notifications)
@@ -52,7 +50,6 @@ async def home():
@app.get("/api")
async def api():
return {
"version": VERSION,
"tesseract_langs": TESSERACT_LANGS,
"logs_folder": LOG_FOLDER
}
@@ -60,18 +57,17 @@ async def api():
@app.get("/api/job/{name:str}")
async def get_job(name: str):
row = db["jobs"][name]
if row:
return row["job"]
raise HTTPException(status_code=404)
job = db["jobs"][name]
if not job:
raise HTTPException(status_code=404)
return job
@app.get("/api/frontend/{name:str}")
async def get_frontend(name: str):
row = db["frontends"][name]
if row:
frontend = row["frontend"]
frontend: Sist2Frontend
frontend = db["frontends"][name]
frontend: Sist2Frontend
if frontend:
frontend.running = frontend.name in RUNNING_FRONTENDS
return frontend
raise HTTPException(status_code=404)
@@ -79,16 +75,16 @@ async def get_frontend(name: str):
@app.get("/api/job/")
async def get_jobs():
return [row["job"] for row in db["jobs"]]
return list(db["jobs"])
@app.put("/api/job/{name:str}")
async def update_job(name: str, job: Sist2Job):
async def update_job(name: str, new_job: Sist2Job):
# TODO: Check etag
job.last_modified = datetime.now()
row = db["jobs"][name]
if not row:
new_job.last_modified = datetime.now()
job = db["jobs"][name]
if not job:
raise HTTPException(status_code=404)
args_that_trigger_full_scan = [
@@ -108,15 +104,15 @@ async def update_job(name: str, job: Sist2Job):
"read_subtitles",
]
for arg in args_that_trigger_full_scan:
if getattr(row["job"].scan_options, arg) != getattr(job.scan_options, arg):
job.do_full_scan = True
if getattr(new_job.scan_options, arg) != getattr(job.scan_options, arg):
new_job.do_full_scan = True
db["jobs"][name] = {"job": job}
db["jobs"][name] = new_job
@app.put("/api/frontend/{name:str}")
async def update_frontend(name: str, frontend: Sist2Frontend):
db["frontends"][name] = {"frontend": frontend}
db["frontends"][name] = frontend
# TODO: Check etag
@@ -142,7 +138,7 @@ def _run_job(job: Sist2Job):
job.last_modified = datetime.now()
if job.status == JobStatus("created"):
job.status = JobStatus("started")
db["jobs"][job.name] = {"job": job}
db["jobs"][job.name] = job
scan_task = Sist2ScanTask(job, f"Scan [{job.name}]")
index_task = Sist2IndexTask(job, f"Index [{job.name}]", depends_on=scan_task)
@@ -153,19 +149,19 @@ def _run_job(job: Sist2Job):
@app.get("/api/job/{name:str}/run")
async def run_job(name: str):
row = db["jobs"][name]
if not row:
job = db["jobs"][name]
if not job:
raise HTTPException(status_code=404)
_run_job(row["job"])
_run_job(job)
return "ok"
@app.delete("/api/job/{name:str}")
async def delete_job(name: str):
row = db["jobs"][name]
if row:
job = db["jobs"][name]
if job:
del db["jobs"][name]
else:
raise HTTPException(status_code=404)
@@ -177,8 +173,8 @@ async def delete_frontend(name: str):
os.kill(RUNNING_FRONTENDS[name], signal.SIGTERM)
del RUNNING_FRONTENDS[name]
row = db["frontends"][name]
if row:
frontend = db["frontends"][name]
if frontend:
del db["frontends"][name]
else:
raise HTTPException(status_code=404)
@@ -190,18 +186,18 @@ async def create_job(name: str):
raise ValueError("Job with the same name already exists")
job = Sist2Job.create_default(name)
db["jobs"][name] = {"job": job}
db["jobs"][name] = job
return job
@app.post("/api/frontend/{name:str}")
async def create_frontend(name: str):
if db["frontend"][name]:
if db["frontends"][name]:
raise ValueError("Frontend with the same name already exists")
frontend = Sist2Frontend.create_default(name)
db["frontends"][name] = {"frontend": frontend}
db["frontends"][name] = frontend
return frontend
@@ -255,7 +251,7 @@ def check_es_version(es_url: str, insecure: bool):
def start_frontend_(frontend: Sist2Frontend):
frontend.web_options.indices = list(map(lambda j: db["jobs"][j]["job"].last_index, frontend.jobs))
frontend.web_options.indices = list(map(lambda j: db["jobs"][j].index_path, frontend.jobs))
pid = sist2.web(frontend.web_options, frontend.name)
RUNNING_FRONTENDS[frontend.name] = pid
@@ -263,11 +259,11 @@ def start_frontend_(frontend: Sist2Frontend):
@app.post("/api/frontend/{name:str}/start")
async def start_frontend(name: str):
row = db["frontends"][name]
if not row:
frontend = db["frontends"][name]
if not frontend:
raise HTTPException(status_code=404)
start_frontend_(row["frontend"])
start_frontend_(frontend)
@app.post("/api/frontend/{name:str}/stop")
@@ -280,8 +276,7 @@ async def stop_frontend(name: str):
@app.get("/api/frontend/")
async def get_frontends():
res = []
for row in db["frontends"]:
frontend = row["frontend"]
for frontend in db["frontends"]:
frontend: Sist2Frontend
frontend.running = frontend.name in RUNNING_FRONTENDS
res.append(frontend)
@@ -364,14 +359,14 @@ def initialize_db():
db["sist2_admin"]["info"] = {"version": DB_SCHEMA_VERSION}
frontend = Sist2Frontend.create_default("default")
db["frontends"]["default"] = {"frontend": frontend}
db["frontends"]["default"] = frontend
logger.info("Initialized database.")
def start_frontends():
for row in db["frontends"]:
frontend: Sist2Frontend = row["frontend"]
for frontend in db["frontends"]:
frontend: Sist2Frontend
if frontend.auto_start and len(frontend.jobs) > 0:
start_frontend_(frontend)
@@ -380,8 +375,11 @@ if __name__ == '__main__':
if not db["sist2_admin"]["info"]:
initialize_db()
elif db["sist2_admin"]["info"]["version"] != DB_SCHEMA_VERSION:
print("Database has incompatible schema version! Delete state.db to continue.")
if db["sist2_admin"]["info"]["version"] == "1":
logger.info("Migrating to v2 database schema")
migrate_v1_to_v2(db)
if db["sist2_admin"]["info"]["version"] == "2":
logger.error("Cannot migrate database from v2 to v3. Delete state.db to proceed.")
exit(-1)
start_frontends()


@@ -10,7 +10,7 @@ from jobs import Sist2Job
def _check_schedule(db: PersistentState, run_job):
for job in (row["job"] for row in db["jobs"]):
for job in db["jobs"]:
job: Sist2Job
if job.schedule_enabled:


@@ -1,23 +1,21 @@
import json
import logging
import os.path
import shutil
import signal
import uuid
from datetime import datetime
from enum import Enum
from hashlib import md5
from logging import FileHandler
from threading import Lock, Thread
from time import sleep
from uuid import uuid4, UUID
from hexlib.db import PersistentState
from pydantic import BaseModel, validator
from pydantic import BaseModel
from config import logger, LOG_FOLDER
from notifications import Notifications
from sist2 import ScanOptions, IndexOptions, Sist2, Sist2Index
from sist2 import ScanOptions, IndexOptions, Sist2
from state import RUNNING_FRONTENDS
from web import Sist2Frontend
@@ -38,7 +36,8 @@ class Sist2Job(BaseModel):
schedule_enabled: bool = False
previous_index: str = None
last_index: str = None
index_path: str = None
previous_index_path: str = None
last_index_date: datetime = None
status: JobStatus = JobStatus("created")
last_modified: datetime
@@ -58,10 +57,10 @@ class Sist2Job(BaseModel):
cron_expression="0 0 * * *"
)
@validator("etag", always=True)
def validate_etag(cls, value, values):
s = values["name"] + values["scan_options"].json() + values["index_options"].json() + values["cron_expression"]
return md5(s.encode()).hexdigest()
# @validator("etag", always=True)
# def validate_etag(cls, value, values):
# s = values["name"] + values["scan_options"].json() + values["index_options"].json() + values["cron_expression"]
# return md5(s.encode()).hexdigest()
class Sist2TaskProgress:
@@ -124,10 +123,10 @@ class Sist2ScanTask(Sist2Task):
self.job.scan_options.name = self.job.name
if self.job.last_index and os.path.exists(self.job.last_index) and not self.job.do_full_scan:
self.job.scan_options.incremental = self.job.last_index
if self.job.index_path is not None and not self.job.do_full_scan:
self.job.scan_options.output = self.job.index_path
else:
self.job.scan_options.incremental = None
self.job.scan_options.output = None
def set_pid(pid):
self.pid = pid
@@ -139,19 +138,26 @@ class Sist2ScanTask(Sist2Task):
self._logger.error(json.dumps({"sist2-admin": f"Process returned non-zero exit code ({return_code})"}))
logger.info(f"Task {self.display_name} failed ({return_code})")
else:
index = Sist2Index(self.job.scan_options.output)
# Save latest index
self.job.previous_index = self.job.last_index
self.job.last_index = index.path
self.job.index_path = self.job.scan_options.output
self.job.last_index_date = datetime.now()
self.job.do_full_scan = False
db["jobs"][self.job.name] = {"job": self.job}
self._logger.info(json.dumps({"sist2-admin": f"Save last_index={self.job.last_index}"}))
db["jobs"][self.job.name] = self.job
self._logger.info(json.dumps({"sist2-admin": f"Save last_index_date={self.job.last_index_date}"}))
logger.info(f"Completed {self.display_name} ({return_code=})")
# Remove old index
if return_code == 0:
if self.job.previous_index_path is not None and self.job.previous_index_path != self.job.index_path:
self._logger.info(json.dumps({"sist2-admin": f"Remove {self.job.previous_index_path=}"}))
try:
os.remove(self.job.previous_index_path)
except FileNotFoundError:
pass
self.job.previous_index_path = self.job.index_path
db["jobs"][self.job.name] = self.job
return return_code
@@ -173,19 +179,12 @@ class Sist2IndexTask(Sist2Task):
ok = return_code == 0
if ok:
# Remove old index
if self.job.previous_index is not None:
self._logger.info(json.dumps({"sist2-admin": f"Remove {self.job.previous_index=}"}))
try:
shutil.rmtree(self.job.previous_index)
except FileNotFoundError:
pass
self.restart_running_frontends(db, sist2)
# Update status
self.job.status = JobStatus("indexed") if ok else JobStatus("failed")
db["jobs"][self.job.name] = {"job": self.job}
self.job.previous_index_path = self.job.index_path
db["jobs"][self.job.name] = self.job
self._logger.info(json.dumps({"sist2-admin": f"Sist2Scan task finished {return_code=}, {duration=}"}))
@@ -195,16 +194,19 @@ class Sist2IndexTask(Sist2Task):
def restart_running_frontends(self, db: PersistentState, sist2: Sist2):
for frontend_name, pid in RUNNING_FRONTENDS.items():
frontend = db["frontends"][frontend_name]["frontend"]
frontend = db["frontends"][frontend_name]
frontend: Sist2Frontend
os.kill(pid, signal.SIGTERM)
try:
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pass
try:
os.wait()
except ChildProcessError:
pass
frontend.web_options.indices = map(lambda j: db["jobs"][j]["job"].last_index, frontend.jobs)
frontend.web_options.indices = map(lambda j: db["jobs"][j].index_path, frontend.jobs)
pid = sist2.web(frontend.web_options, frontend.name)
RUNNING_FRONTENDS[frontend_name] = pid


@@ -2,7 +2,6 @@ import datetime
import json
import logging
import os.path
import traceback
from datetime import datetime
from io import TextIOWrapper
from logging import FileHandler
@@ -63,7 +62,7 @@ class WebOptions(BaseModel):
if self.auth:
args.append(f"--auth={self.auth}")
if self.tag_auth:
args.append(f"--tag_auth={self.tag_auth}")
args.append(f"--tag-auth={self.tag_auth}")
if self.dev:
args.append(f"--dev")
@@ -78,10 +77,10 @@ class IndexOptions(BaseModel):
es_url: str = "http://elasticsearch:9200"
es_insecure_ssl: bool = False
es_index: str = "sist2"
incremental_index: bool = False
incremental_index: bool = True
script: str = ""
script_file: str = None
batch_size: int = 100
batch_size: int = 70
def __init__(self, **kwargs):
super().__init__(**kwargs)
@@ -110,15 +109,14 @@ ARCHIVE_RECURSE = "recurse"
class ScanOptions(BaseModel):
path: str
threads: int = 1
mem_throttle: int = 0
thumbnail_quality: float = 1.0
thumbnail_size: int = 500
thumbnail_quality: int = 2
thumbnail_size: int = 552
thumbnail_count: int = 1
content_size: int = 32768
depth: int = -1
archive: str = ARCHIVE_RECURSE
archive_passphrase: str = None
ocr_lang: bool = None
ocr_lang: str = None
ocr_images: bool = False
ocr_ebooks: bool = False
exclude: str = None
@@ -128,7 +126,8 @@ class ScanOptions(BaseModel):
read_subtitles: bool = False
fast_epub: bool = False
checksums: bool = False
incremental: str = None
incremental: bool = True
optimize_index: bool = False
output: str = None
name: str = None
rewrite_url: str = None
@@ -138,13 +137,15 @@ class ScanOptions(BaseModel):
super().__init__(**kwargs)
def args(self):
args = ["scan", self.path, f"--threads={self.threads}", f"--mem-throttle={self.mem_throttle}",
f"--thumbnail-quality={self.thumbnail_quality}", f"--thumbnail-count={self.thumbnail_count}",
args = ["scan", self.path, f"--threads={self.threads}", f"--thumbnail-quality={self.thumbnail_quality}",
f"--thumbnail-count={self.thumbnail_count}", f"--thumbnail-size={self.thumbnail_size}",
f"--content-size={self.content_size}", f"--output={self.output}", f"--depth={self.depth}",
f"--archive={self.archive}", f"--mem-buffer={self.mem_buffer}"]
if self.incremental:
args.append(f"--incremental={self.incremental}")
args.append(f"--incremental")
if self.optimize_index:
args.append(f"--optimize-index")
if self.rewrite_url:
args.append(f"--rewrite-url={self.rewrite_url}")
if self.name:
@@ -234,11 +235,11 @@ class Sist2:
def scan(self, options: ScanOptions, logs_cb, set_pid_cb):
output_dir = os.path.join(
self._data_dir,
f"scan-{datetime.now()}.sist2"
)
options.output = output_dir
if options.output is None:
options.output = os.path.join(
self._data_dir,
f"scan-{options.name.replace('/', '_')}-{datetime.now()}.sist2"
)
args = [
self._bin_path,
@@ -277,23 +278,17 @@ class Sist2:
@staticmethod
def _consume_logs_stdout(logs_cb, proc):
pipe_wrapper = TextIOWrapper(proc.stdout, encoding="utf8", errors="ignore")
try:
for line in pipe_wrapper:
for line in pipe_wrapper:
try:
if line.strip() == "":
continue
log_object = json.loads(line)
logs_cb(log_object)
except Exception as e:
proc.kill()
try:
print(line)
except NameError:
pass
print(traceback.format_exc())
finally:
pass
# proc.wait()
# pipe_wrapper.close()
except Exception as e:
try:
logs_cb({"sist2-admin": f"Could not decode log line: {line}; {e}"})
except NameError:
pass
def web(self, options: WebOptions, name: str):


@@ -1,6 +1,7 @@
from typing import Dict
import shutil
from hexlib.db import Table
from hexlib.db import Table, PersistentState
import pickle
from tesseract import get_tesseract_langs
@@ -9,7 +10,7 @@ RUNNING_FRONTENDS: Dict[str, int] = {}
TESSERACT_LANGS = get_tesseract_langs()
DB_SCHEMA_VERSION = "1"
DB_SCHEMA_VERSION = "3"
from pydantic import BaseModel
@@ -48,3 +49,31 @@ class PickleTable(Table):
for row in super().sql(where_clause, *params):
yield dict((k, _deserialize(v)) for k, v in row.items())
def migrate_v1_to_v2(db: PersistentState):
shutil.copy(db.dbfile, db.dbfile + "-before-migrate-v2.bak")
# Frontends
db._table_factory = PickleTable
frontends = [row["frontend"] for row in db["frontends"]]
del db["frontends"]
db._table_factory = Table
for frontend in frontends:
db["frontends"][frontend.name] = frontend
list(db["frontends"])
# Jobs
db._table_factory = PickleTable
jobs = [row["job"] for row in db["jobs"]]
del db["jobs"]
db._table_factory = Table
for job in jobs:
db["jobs"][job.name] = job
list(db["jobs"])
db["sist2_admin"]["info"] = {
"version": "2"
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,3 +0,0 @@
<!doctype html><html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,user-scalable=no"/><title>sist2</title><script defer="defer" src="js/chunk-vendors.js"></script><script defer="defer" src="js/index.js"></script><link href="css/chunk-vendors.css" rel="stylesheet"><link href="css/index.css" rel="stylesheet"></head><body><noscript><style>body {
height: initial;
}</style><div style="text-align: center; margin-top: 100px"><strong>We're sorry but sist2 doesn't work properly without JavaScript enabled. Please enable it to continue.</strong><br/><strong>Nous sommes désolés mais sist2 ne fonctionne pas correctement si JavaScript n'est pas activé. Veuillez l'activer pour continuer.</strong></div></noscript><div id="app"></div></body></html>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because it is too large


@@ -9,13 +9,14 @@
"dependencies": {
"@auth0/auth0-spa-js": "^2.0.2",
"@egjs/vue-infinitegrid": "3.3.0",
"@tensorflow/tfjs": "^4.4.0",
"axios": "^0.25.0",
"bootstrap-vue": "^2.21.2",
"core-js": "^3.6.5",
"d3": "^5.6.1",
"d3": "^7.8.4",
"date-fns": "^2.21.3",
"dom-to-image": "^2.6.0",
"fslightbox-vue": "file:../../../../mnt/Hatchery/projects/sist2/fslightbox-vue-pro-1.3.1.tgz",
"fslightbox-vue": "fslightbox-vue.tgz",
"nouislider": "^15.2.0",
"underscore": "^1.13.1",
"vue": "^2.6.12",


@@ -1,383 +1,395 @@
<template>
<div id="app" :class="getClass()" v-if="!authLoading">
<NavBar></NavBar>
<router-view v-if="!configLoading"/>
</div>
<div class="loading-page" v-else>
<div class="loading-spinners">
<b-spinner type="grow" variant="primary"></b-spinner>
<b-spinner type="grow" variant="primary"></b-spinner>
<b-spinner type="grow" variant="primary"></b-spinner>
<div id="app" :class="getClass()" v-if="!authLoading">
<NavBar></NavBar>
<router-view v-if="!configLoading"/>
</div>
<div class="loading-text">
Loading Chargement 装载
<div class="loading-page" v-else>
<div class="loading-spinners">
<b-spinner type="grow" variant="primary"></b-spinner>
<b-spinner type="grow" variant="primary"></b-spinner>
<b-spinner type="grow" variant="primary"></b-spinner>
</div>
<div class="loading-text">
Loading Chargement 装载 Wird geladen
</div>
</div>
</div>
</template>
<script>
import NavBar from "@/components/NavBar";
import {mapActions, mapGetters, mapMutations} from "vuex";
import Sist2Api from "@/Sist2Api";
import ModelsRepo from "@/ml/modelsRepo";
import {setupAuth0} from "@/main";
export default {
components: {NavBar},
data() {
return {
configLoading: false,
authLoading: true,
sist2InfoLoading: true
}
},
computed: {
...mapGetters(["optTheme"]),
},
mounted() {
this.$store.dispatch("loadConfiguration").then(() => {
this.$root.$i18n.locale = this.$store.state.optLang;
});
components: {NavBar},
data() {
return {
configLoading: false,
authLoading: true,
sist2InfoLoading: true
}
},
computed: {
...mapGetters(["optTheme"]),
},
mounted() {
this.$store.dispatch("loadConfiguration").then(() => {
this.$root.$i18n.locale = this.$store.state.optLang;
ModelsRepo.init(this.$store.getters.mlRepositoryList).catch(err => {
this.$bvToast.toast(
this.$t("ml.repoFetchError"),
{
title: this.$t("ml.repoFetchErrorTitle"),
noAutoHide: true,
toaster: "b-toaster-bottom-right",
headerClass: "toast-header-warning",
bodyClass: "toast-body-warning",
});
});
});
this.$store.subscribe((mutation) => {
if (mutation.type === "setOptLang") {
this.$root.$i18n.locale = mutation.payload;
this.configLoading = true;
window.setTimeout(() => this.configLoading = false, 10);
}
if (mutation.type === "setAuth0Token") {
this.authLoading = false;
}
});
Sist2Api.getSist2Info().then(data => {
if (data.auth0Enabled) {
this.authLoading = true;
setupAuth0(data.auth0Domain, data.auth0ClientId, data.auth0Audience)
this.$auth.$watch("loading", loading => {
if (loading === false) {
if (!this.$auth.isAuthenticated) {
this.$auth.loginWithRedirect();
return;
this.$store.subscribe((mutation) => {
if (mutation.type === "setOptLang") {
this.$root.$i18n.locale = mutation.payload;
this.configLoading = true;
window.setTimeout(() => this.configLoading = false, 10);
}
// Remove "code" param
window.history.replaceState({}, "", "/" + window.location.hash);
this.$store.dispatch("loadAuth0Token");
}
if (mutation.type === "setAuth0Token") {
this.authLoading = false;
}
});
} else {
this.authLoading = false;
}
this.setSist2Info(data);
this.setIndices(data.indices)
});
},
methods: {
...mapActions(["setSist2Info",]),
...mapMutations(["setIndices",]),
getClass() {
return {
"theme-light": this.optTheme === "light",
"theme-black": this.optTheme === "black",
}
Sist2Api.getSist2Info().then(data => {
if (data.auth0Enabled) {
this.authLoading = true;
setupAuth0(data.auth0Domain, data.auth0ClientId, data.auth0Audience)
this.$auth.$watch("loading", loading => {
if (loading === false) {
if (!this.$auth.isAuthenticated) {
this.$auth.loginWithRedirect();
return;
}
// Remove "code" param
window.history.replaceState({}, "", "/" + window.location.hash);
this.$store.dispatch("loadAuth0Token");
}
});
} else {
this.authLoading = false;
}
this.setSist2Info(data);
this.setIndices(data.indices)
});
},
methods: {
...mapActions(["setSist2Info",]),
...mapMutations(["setIndices",]),
getClass() {
return {
"theme-light": this.optTheme === "light",
"theme-black": this.optTheme === "black",
}
}
}
}
,
}
</script>
<style>
html, body {
height: 100%;
}
#app {
/*font-family: Avenir, Helvetica, Arial, sans-serif;*/
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
/*text-align: center;*/
color: #2c3e50;
padding-bottom: 1em;
min-height: 100%;
}
/*Black theme*/
.theme-black {
background-color: #000;
}
.theme-black .card, .theme-black .modal-content {
background: #212121;
color: #e0e0e0;
border-radius: 1px;
border: none;
}
.theme-black .table {
color: #e0e0e0;
}
.theme-black .table td, .theme-black .table th {
border: none;
}
.theme-black .table thead th {
border-bottom: 1px solid #646464;
}
.theme-black .custom-select {
overflow: auto;
background-color: #37474F;
border: 1px solid #616161;
color: #bdbdbd;
}
.theme-black .custom-select:focus {
border-color: #757575;
outline: 0;
box-shadow: 0 0 0 .2rem rgba(0, 123, 255, .25);
}
.theme-black .inspire-tree .selected > .wholerow, .theme-black .inspire-tree .selected > .title-wrap:hover + .wholerow {
background: none !important;
}
.theme-black .inspire-tree .icon-expand::before, .theme-black .inspire-tree .icon-collapse::before {
background-color: black !important;
}
.theme-black .inspire-tree .title {
color: #eee;
}
.theme-black .inspire-tree {
font-weight: 400;
font-size: 14px;
font-family: Helvetica, Nueue, Verdana, sans-serif;
max-height: 350px;
overflow: auto;
}
.inspire-tree [type=checkbox] {
left: 22px !important;
top: 7px !important;
}
.theme-black .form-control {
background-color: #37474F;
border: 1px solid #616161;
color: #dbdbdb !important;
}
.theme-black .form-control:focus {
background-color: #546E7A;
color: #fff;
}
.theme-black .input-group-text, .theme-black .default-input {
background: #37474F !important;
border: 1px solid #616161 !important;
color: #dbdbdb !important;
}
.theme-black ::placeholder {
color: #BDBDBD !important;
opacity: 1;
}
.theme-black .nav-tabs .nav-link {
color: #e0e0e0;
border-radius: 0;
}
.theme-black .nav-tabs .nav-item.show .nav-link, .theme-black .nav-tabs .nav-link.active {
background-color: #212121;
border-color: #616161 #616161 #212121;
color: #e0e0e0;
}
.theme-black .nav-tabs .nav-link:focus, .theme-black .nav-tabs .nav-link:focus {
border-color: #616161 #616161 #212121;
color: #e0e0e0;
}
.theme-black .nav-tabs .nav-link:focus, .theme-black .nav-tabs .nav-link:hover {
border-color: #e0e0e0 #e0e0e0 #212121;
color: #e0e0e0;
}
.theme-black .nav-tabs {
border-bottom: #616161;
}
.theme-black a:hover, .theme-black .btn:hover {
color: #fff;
}
.theme-black .b-dropdown a:hover {
color: inherit;
}
.theme-black .btn {
color: #eee;
}
.theme-black .modal-header .close {
color: #e0e0e0;
text-shadow: none;
}
.theme-black .modal-header {
border-bottom: 1px solid #646464;
}
/* -------------------------- */
#nav {
padding: 30px;
}
#nav a {
font-weight: bold;
color: #2c3e50;
}
#nav a.router-link-exact-active {
color: #42b983;
}
.mobile {
display: none;
}
.container {
padding-top: 1em;
}
@media (max-width: 650px) {
.mobile {
display: initial;
}
.not-mobile {
display: none;
}
.grid-single-column .fit {
max-height: none !important;
}
.container {
padding-left: 0;
padding-right: 0;
padding-top: 0
}
.lightbox-caption {
display: none;
}
}
.info-icon {
width: 1rem;
margin-right: 0.2rem;
cursor: pointer;
line-height: 1rem;
height: 1rem;
background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHhtbG5zOnhsaW5rPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5L3hsaW5rIiB4PSIwcHgiIHk9IjBweCIKICAgICB2aWV3Qm94PSIwIDAgNDI2LjY2NyA0MjYuNjY3IiBzdHlsZT0iZW5hYmxlLWJhY2tncm91bmQ6bmV3IDAgMCA0MjYuNjY3IDQyNi42Njc7IiBmaWxsPSIjZmZmIj4KPGc+CiAgICA8Zz4KICAgICAgICA8Zz4KICAgICAgICAgICAgPHJlY3QgeD0iMTkyIiB5PSIxOTIiIHdpZHRoPSI0Mi42NjciIGhlaWdodD0iMTI4Ii8+CiAgICAgICAgICAgIDxwYXRoIGQ9Ik0yMTMuMzMzLDBDOTUuNDY3LDAsMCw5NS40NjcsMCwyMTMuMzMzczk1LjQ2NywyMTMuMzMzLDIxMy4zMzMsMjEzLjMzM1M0MjYuNjY3LDMzMS4yLDQyNi42NjcsMjEzLjMzMwogICAgICAgICAgICAgICAgUzMzMS4yLDAsMjEzLjMzMywweiBNMjEzLjMzMywzODRjLTk0LjA4LDAtMTcwLjY2Ny03Ni41ODctMTcwLjY2Ny0xNzAuNjY3UzExOS4yNTMsNDIuNjY3LDIxMy4zMzMsNDIuNjY3CiAgICAgICAgICAgICAgICBTMzg0LDExOS4yNTMsMzg0LDIxMy4zMzNTMzA3LjQxMywzODQsMjEzLjMzMywzODR6Ii8+CiAgICAgICAgICAgIDxyZWN0IHg9IjE5MiIgeT0iMTA2LjY2NyIgd2lkdGg9IjQyLjY2NyIgaGVpZ2h0PSI0Mi42NjciLz4KICAgICAgICA8L2c+CiAgICA8L2c+CjwvZz4KPC9zdmc+Cg==);
filter: brightness(45%);
display: block;
}
.tabs {
margin-top: 10px;
}
.modal-title {
text-overflow: ellipsis;
overflow: hidden;
white-space: nowrap;
}
@media screen and (min-width: 1500px) {
.container {
max-width: 1440px;
}
}
.noUi-connects {
border-radius: 1px !important;
}
mark {
background: #fff217;
border-radius: 0;
padding: 1px 0;
color: inherit;
}
.theme-black mark {
background: rgba(251, 191, 41, 0.25);
border-radius: 0;
padding: 1px 0;
color: inherit;
}
.theme-black .content-div mark {
background: rgba(251, 191, 41, 0.40);
color: white;
}
.content-div {
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 13px;
padding: 1em;
background-color: #f5f5f5;
border: 1px solid #ccc;
border-radius: 4px;
margin: 3px;
white-space: normal;
color: #000;
overflow: hidden;
}
.theme-black .content-div {
background-color: #37474F;
border: 1px solid #616161;
color: #E0E0E0FF;
}
.graph {
display: inline-block;
width: 40%;
}
.pointer {
cursor: pointer;
}
.loading-page {
display: flex;
justify-content: center;
align-items: center;
flex-direction: column;
height: 100%;
gap: 15px
}
.loading-spinners {
display: flex;
gap: 10px;
}
.loading-text {
text-align: center;
}
</style>


@@ -61,6 +61,7 @@ export interface EsHit {
isAudio: boolean
hasThumbnail: boolean
hasVidPreview: boolean
imageAspectRatio: number
/** Number of thumbnails available */
tnNum: number
}
@@ -155,6 +156,9 @@ class Sist2Api {
&& hit._source.videoc !== "raw" && hit._source.videoc !== "ppm") {
hit._props.isPlayableImage = true;
}
if ("width" in hit._source && "height" in hit._source) {
hit._props.imageAspectRatio = hit._source.width / hit._source.height;
}
break;
case "video":
if ("videoc" in hit._source) {
@@ -187,30 +191,6 @@ class Sist2Api {
setHitTags(hit: EsHit): void {
const tags = [] as Tag[];
const mimeCategory = hit._source.mime == null ? null : hit._source.mime.split("/")[0];
switch (mimeCategory) {
case "image":
case "video":
if ("videoc" in hit._source && hit._source.videoc) {
tags.push({
style: "video",
text: hit._source.videoc.replace(" ", ""),
userTag: false
} as Tag);
}
break
case "audio":
if ("audioc" in hit._source && hit._source.audioc) {
tags.push({
style: "audio",
text: hit._source.audioc,
userTag: false
} as Tag);
}
break;
}
// User tags
if ("tag" in hit._source) {
hit._source.tag.forEach(tag => {
@@ -381,20 +361,20 @@ class Sist2Api {
});
}
getTreemapCsvUrl(indexId: string) {
return `${this.baseUrl}s/${indexId}/1`;
getTreemapStat(indexId: string) {
return `${this.baseUrl}s/${indexId}/TMAP`;
}
getMimeCsvUrl(indexId: string) {
return `${this.baseUrl}s/${indexId}/2`;
getMimeStat(indexId: string) {
return `${this.baseUrl}s/${indexId}/MAGG`;
}
getSizeCsv(indexId: string) {
return `${this.baseUrl}s/${indexId}/3`;
getSizeStat(indexId: string) {
return `${this.baseUrl}s/${indexId}/SAGG`;
}
getDateCsv(indexId: string) {
return `${this.baseUrl}s/${indexId}/4`;
getDateStat(indexId: string) {
return `${this.baseUrl}s/${indexId}/DAGG`;
}
}
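
The last hunk above moves the per-index stats from numbered CSV routes (s/{indexId}/1 through 4) to named JSON routes (TMAP, MAGG, SAGG, DAGG), which is why the callers further down switch from d3.csv() to d3.json(). A minimal sketch of the new call pattern; the index id and the row fields are assumptions for illustration:

import * as d3 from "d3";

const indexId = "my-index";  // hypothetical index id
d3.json(`s/${indexId}/MAGG`).then(rows => {
    // Assumed row shape, mirroring the columns the old CSV endpoint served.
    rows.forEach(row => console.log(row.mime, row.count));
});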


@@ -0,0 +1,21 @@
<template>
<span :style="getStyle()">{{span.text}}</span>
</template>
<script>
import ModelsRepo from "@/ml/modelsRepo";
export default {
name: "AnalyzedContentSpan",
props: ["span", "text"],
methods: {
getStyle() {
return ModelsRepo.data[this.$store.getters.mlModel.name].labelStyles[this.span.label];
}
}
}
</script>
<style scoped></style>


@@ -0,0 +1,75 @@
<template>
<div>
<b-card class="mb-2">
<AnalyzedContentSpan v-for="span of legend" :key="span.id" :span="span"
class="mr-2"></AnalyzedContentSpan>
</b-card>
<div class="content-div">
<AnalyzedContentSpan v-for="span of mergedSpans" :key="span.id" :span="span"></AnalyzedContentSpan>
</div>
</div>
</template>
<script>
import AnalyzedContentSpan from "@/components/AnalyzedContentSpan.vue";
import ModelsRepo from "@/ml/modelsRepo";
export default {
name: "AnalyzedContentSpanContainer",
components: {AnalyzedContentSpan},
props: ["spans", "text"],
computed: {
legend() {
return Object.entries(ModelsRepo.data[this.$store.state.mlModel.name].legend)
.map(([label, name]) => ({
text: name,
id: label,
label: label
}));
},
mergedSpans() {
const spans = this.spans;
const merged = [];
let lastLabel = null;
let fixSpace = false;
for (let i = 0; i < spans.length; i++) {
if (spans[i].label !== lastLabel) {
let start = spans[i].wordIndex;
const nextSpan = spans.slice(i + 1).find(s => s.label !== spans[i].label)
let end = nextSpan ? nextSpan.wordIndex : undefined;
if (end !== undefined && this.text[end - 1] === " ") {
end -= 1;
fixSpace = true;
}
merged.push({
text: this.text.slice(start, end),
label: spans[i].label,
id: spans[i].wordIndex
});
if (fixSpace) {
merged.push({
text: " ",
label: "O",
id: end
});
fixSpace = false;
}
lastLabel = spans[i].label;
}
}
return merged;
},
},
}
</script>
<style scoped></style>
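
The mergedSpans computed property above coalesces consecutive spans that share a label into one slice of the original text, and splits a trailing space out into its own "O" (outside) span so that highlight styling does not bleed across the word boundary. A standalone sketch of the same merging logic, assuming wordIndex is a character offset into text (which is what the slice() calls imply):

function mergeSpans(spans, text) {
    const merged = [];
    let lastLabel = null;
    for (let i = 0; i < spans.length; i++) {
        if (spans[i].label === lastLabel) {
            continue;  // already covered by the previous merged span
        }
        const start = spans[i].wordIndex;
        const nextSpan = spans.slice(i + 1).find(s => s.label !== spans[i].label);
        let end = nextSpan ? nextSpan.wordIndex : undefined;
        let fixSpace = false;
        if (end !== undefined && text[end - 1] === " ") {
            end -= 1;  // keep the separating space out of the labelled span
            fixSpace = true;
        }
        merged.push({text: text.slice(start, end), label: spans[i].label, id: start});
        if (fixSpace) {
            merged.push({text: " ", label: "O", id: end});
        }
        lastLabel = spans[i].label;
    }
    return merged;
}

// Hypothetical NER output over a short string:
console.log(mergeSpans(
    [{wordIndex: 0, label: "PER"}, {wordIndex: 5, label: "PER"}, {wordIndex: 11, label: "O"}],
    "John Smith went home"
));
// → "John Smith" [PER], " " [O], "went home" [O]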


@@ -120,7 +120,7 @@ export default {
update(indexId) {
const svg = d3.select("#date-histogram");
d3.csv(Sist2Api.getDateCsv(indexId)).then(tabularData => {
d3.json(Sist2Api.getDateStat(indexId)).then(tabularData => {
dateHistogram(tabularData.slice(), svg, this.$t("d3.dateHistogram"));
});
}


@@ -91,7 +91,7 @@ export default {
const mimeSvgCount = d3.select("#agg-mime-count");
const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;
d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
mimeBarCount(tabularData.slice(), mimeSvgCount, fillOpacity, this.$t("d3.mimeCount"));
});
}


@@ -90,7 +90,7 @@ export default {
const mimeSvgSize = d3.select("#agg-mime-size");
const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;
d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
mimeBarSize(tabularData.slice(), mimeSvgSize, fillOpacity, this.$t("d3.mimeSize"));
});
}


@@ -117,7 +117,7 @@ export default {
update(indexId) {
const svg = d3.select("#size-histogram");
d3.csv(Sist2Api.getSizeCsv(indexId)).then(tabularData => {
d3.json(Sist2Api.getSizeStat(indexId)).then(tabularData => {
sizeHistogram(tabularData.slice(), svg, this.$t("d3.sizeHistogram"));
});
}


@@ -240,7 +240,7 @@ export default {
.style("overflow", "visible")
.style("font", "10px sans-serif");
d3.csv(Sist2Api.getTreemapCsvUrl(indexId)).then(tabularData => {
d3.json(Sist2Api.getTreemapStat(indexId)).then(tabularData => {
tabularData.forEach(row => {
row.taxonomy = row.path.split("/");
row.size = Number(row.size);


@@ -1,5 +1,5 @@
<template>
<b-card class="mb-4 mt-4">
<b-card v-if="$store.state.sist2Info.showDebugInfo" class="mb-4 mt-4">
<b-card-title><DebugIcon class="mr-1"></DebugIcon>{{ $t("debug") }}</b-card-title>
<p v-html="$t('debugDescription')"></p>


@@ -27,6 +27,11 @@
<DocFileTitle :doc="doc"></DocFileTitle>
</div>
<!-- Featured line -->
<div style="display: flex">
<FeaturedFieldsLine :doc="doc"></FeaturedFieldsLine>
</div>
<!-- Tags -->
<div class="card-text">
<TagContainer :hit="doc"></TagContainer>
@@ -43,10 +48,11 @@ import DocFileTitle from "@/components/DocFileTitle.vue";
import DocInfoModal from "@/components/DocInfoModal.vue";
import ContentDiv from "@/components/ContentDiv.vue";
import FullThumbnail from "@/components/FullThumbnail";
import FeaturedFieldsLine from "@/components/FeaturedFieldsLine";
export default {
components: {FullThumbnail, ContentDiv, DocInfoModal, DocFileTitle, TagContainer},
components: {FeaturedFieldsLine, FullThumbnail, ContentDiv, DocInfoModal, DocFileTitle, TagContainer},
props: ["doc", "width"],
data() {
return {


@@ -50,6 +50,11 @@
<span v-if="doc._source.author && doc._source.pages" class="mx-1">-</span>
<span v-if="doc._source.author">{{ doc._source.author }}</span>
</div>
<!-- Featured line -->
<div style="display: flex">
<FeaturedFieldsLine :doc="doc"></FeaturedFieldsLine>
</div>
</div>
</div>
</b-list-group-item>
@@ -61,10 +66,11 @@ import DocFileTitle from "@/components/DocFileTitle";
import DocInfoModal from "@/components/DocInfoModal";
import ContentDiv from "@/components/ContentDiv";
import FileIcon from "@/components/icons/FileIcon";
import FeaturedFieldsLine from "@/components/FeaturedFieldsLine";
export default {
name: "DocListItem",
components: {FileIcon, ContentDiv, DocInfoModal, DocFileTitle, TagContainer},
components: {FileIcon, ContentDiv, DocInfoModal, DocFileTitle, TagContainer, FeaturedFieldsLine},
props: ["doc"],
data() {
return {


@@ -0,0 +1,46 @@
<template>
<div class="featured-line" v-html="featuredLineHtml"></div>
</template>
<script>
import {humanDate, humanFileSize} from "@/util";
function scopedEval(context, expr) {
const evaluator = Function.apply(null, [...Object.keys(context), "expr", "return eval(expr)"]);
return evaluator.apply(null, [...Object.values(context), expr]);
}
export default {
name: "FeaturedFieldsLine",
props: ["doc"],
computed: {
featuredLineHtml() {
if (this.$store.getters.optFeaturedFields === undefined) {
return "";
}
const scope = {doc: this.doc._source, humanDate: humanDate, humanFileSize: humanFileSize};
return this.$store.getters.optFeaturedFields
.replaceAll(/\$\{([^}]*)}/g, (match, g1) => {
return scopedEval(scope, g1);
});
}
}
}
</script>
<style scoped>
.featured-line {
font-size: 90%;
font-family: 'Source Sans Pro', 'Helvetica Neue', Arial, sans-serif;
color: #424242;
padding-left: 2px;
}
.theme-black .featured-line {
color: #bebebe;
}
</style>
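
The component above evaluates the user-supplied featured-fields template with scopedEval(), which compiles a Function whose parameter list is the context object's keys; expressions inside ${...} can therefore reference doc, humanDate and humanFileSize directly. A self-contained sketch with a stand-in formatter (the stub and the sample field values are illustrative, not sist2's real helpers):

function scopedEval(context, expr) {
    const evaluator = Function.apply(null, [...Object.keys(context), "expr", "return eval(expr)"]);
    return evaluator.apply(null, [...Object.values(context), expr]);
}

// Hypothetical document fields and a stand-in for the real humanFileSize():
const scope = {
    doc: {author: "jdoe", size: 1048576},
    humanFileSize: n => `${(n / 1024 / 1024).toFixed(1)}MiB`
};
const template = "${doc.author}, ${humanFileSize(doc.size)}";
console.log(template.replaceAll(/\$\{([^}]*)}/g, (match, g1) => scopedEval(scope, g1)));
// → "jdoe, 1.0MiB"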


@@ -6,13 +6,13 @@
</div>
<div
v-if="doc._props.isImage && !hover && doc._props.tnW / doc._props.tnH < 5"
v-if="doc._props.isImage && doc._props.imageAspectRatio < 5"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ `${doc._source.width}x${doc._source.height}` }}</span>
</div>
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover"
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
@@ -63,6 +63,11 @@ export default {
},
computed: {
tnSrc() {
return this.getThumbnailSrc(this.currentThumbnailNum);
},
},
methods: {
getThumbnailSrc(thumbnailNum) {
const doc = this.doc;
const props = doc._props;
if (props.isGif && this.hover) {
@@ -70,10 +75,8 @@ export default {
}
return (this.currentThumbnailNum === 0)
? `t/${doc._source.index}/${doc._id}`
: `t/${doc._source.index}/${doc._id}${String(this.currentThumbnailNum).padStart(4, "0")}`;
: `t/${doc._source.index}/${doc._id}/${String(thumbnailNum).padStart(4, "0")}`;
},
},
methods: {
humanTime: humanTime,
onThumbnailClick() {
this.$emit("onThumbnailClick");
@@ -86,9 +89,14 @@ export default {
},
onTnEnter() {
this.hover = true;
const start = Date.now()
if (this.doc._props.hasVidPreview) {
this.currentThumbnailNum += 1;
this.scheduleNextTnNum();
let img = new Image();
img.src = this.getThumbnailSrc(this.currentThumbnailNum + 1);
img.onload = () => {
this.currentThumbnailNum += 1;
this.scheduleNextTnNum(Date.now() - start);
}
}
},
onTnLeave() {
@@ -99,17 +107,23 @@ export default {
this.timeoutId = null;
}
},
scheduleNextTnNum() {
const INTERVAL = this.$store.state.optVidPreviewInterval ?? 700;
scheduleNextTnNum(offset = 0) {
const INTERVAL = (this.$store.state.optVidPreviewInterval ?? 700) - offset;
this.timeoutId = window.setTimeout(() => {
const start = Date.now();
if (!this.hover) {
return;
}
this.scheduleNextTnNum();
if (this.currentThumbnailNum === this.doc._props.tnNum - 1) {
this.currentThumbnailNum = 0;
this.scheduleNextTnNum();
} else {
this.currentThumbnailNum += 1;
let img = new Image();
img.src = this.getThumbnailSrc(this.currentThumbnailNum + 1);
img.onload = () => {
this.currentThumbnailNum += 1;
this.scheduleNextTnNum(Date.now() - start);
}
}
}, INTERVAL);
},
@@ -152,17 +166,18 @@ export default {
}
.badge-resolution {
color: #212529;
background-color: #FFC107;
color: #c6c6c6;
background-color: #272727CC;
padding: 2px 3px;
}
.card-img-overlay {
pointer-events: none;
padding: 0.75rem;
bottom: unset;
top: 0;
padding: 2px 6px;
bottom: 4px;
top: unset;
left: unset;
right: unset;
right: 0;
}
.small-badge {
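
The onTnEnter/scheduleNextTnNum changes above replace a fixed setTimeout cadence with preload-then-advance: the next frame is fetched into an off-DOM Image first, the counter only advances in img.onload, and the elapsed fetch time is subtracted from the next interval so frames step at a roughly constant rate without flashing unloaded thumbnails. A minimal sketch of the pattern; the URL and the callback wiring are assumptions:

// Preload the next thumbnail before showing it (URL shape follows getThumbnailSrc() above).
function preloadThenAdvance(url, onReady) {
    const start = Date.now();
    const img = new Image();
    img.src = url;  // starts the fetch; the browser caches the response
    img.onload = () => onReady(Date.now() - start);  // pass elapsed time so the caller can shorten its timer
}

preloadThenAdvance("t/myIndex/abc123/0001", elapsed => console.log(`frame ready after ${elapsed}ms`));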


@@ -1,6 +1,36 @@
<template>
<Preloader v-if="loading"></Preloader>
<div v-else-if="content" class="content-div" v-html="content"></div>
<Preloader v-if="loading"></Preloader>
<div v-else-if="content">
<b-form inline class="my-2" v-if="ModelsRepo.getOptions().length > 0">
<b-checkbox class="ml-auto mr-2" :checked="optAutoAnalyze"
@input="setOptAutoAnalyze($event); $store.dispatch('updateConfiguration')">
{{ $t("ml.auto") }}
</b-checkbox>
<b-button :disabled="mlPredictionsLoading || mlLoading" @click="mlAnalyze" variant="primary"
>{{ $t("ml.analyzeText") }}
</b-button>
<b-select :disabled="mlPredictionsLoading || mlLoading" class="ml-2" v-model="mlModel">
<b-select-option :value="opt.value" v-for="opt of ModelsRepo.getOptions()">{{ opt.text }}
</b-select-option>
</b-select>
</b-form>
<b-progress v-if="mlLoading" variant="warning" show-progress :max="1" class="mb-3"
>
<b-progress-bar :value="modelLoadingProgress">
<strong>{{ ((modelLoadingProgress * modelSize) / (1024*1024)).toFixed(1) }}MB / {{
(modelSize / (1024 * 1024)).toFixed(1)
}}MB</strong>
</b-progress-bar>
</b-progress>
<b-progress v-if="mlPredictionsLoading" variant="primary" :value="modelPredictionProgress"
:max="content.length" class="mb-3"></b-progress>
<AnalyzedContentSpansContainer v-if="analyzedContentSpans.length > 0"
:spans="analyzedContentSpans" :text="rawContent"></AnalyzedContentSpansContainer>
<div v-else class="content-div" v-html="content"></div>
</div>
</template>
<script>
@@ -8,87 +38,169 @@ import Sist2Api from "@/Sist2Api";
import Preloader from "@/components/Preloader";
import Sist2Query from "@/Sist2Query";
import store from "@/store";
import BertNerModel from "@/ml/BertNerModel";
import AnalyzedContentSpansContainer from "@/components/AnalyzedContentSpanContainer.vue";
import ModelsRepo from "@/ml/modelsRepo";
import {mapGetters, mapMutations} from "vuex";
export default {
name: "LazyContentDiv",
components: {Preloader},
props: ["docId"],
data() {
return {
content: "",
loading: true
name: "LazyContentDiv",
components: {AnalyzedContentSpansContainer, Preloader},
props: ["docId"],
data() {
return {
ModelsRepo,
content: "",
rawContent: "",
loading: true,
modelLoadingProgress: 0,
modelPredictionProgress: 0,
mlPredictionsLoading: false,
mlLoading: false,
mlModel: null,
analyzedContentSpans: []
}
},
mounted() {
if (this.$store.getters.optMlDefaultModel) {
this.mlModel = this.$store.getters.optMlDefaultModel
} else {
this.mlModel = ModelsRepo.getDefaultModel();
}
const query = Sist2Query.searchQuery();
if (this.$store.state.optHighlight) {
const fields = this.$store.state.fuzzy
? {"content.nGram": {}}
: {content: {}};
query.highlight = {
pre_tags: ["<mark>"],
post_tags: ["</mark>"],
number_of_fragments: 0,
fields,
};
if (!store.state.sist2Info.esVersionLegacy) {
query.highlight.max_analyzed_offset = 999_999;
}
}
if ("function_score" in query.query) {
query.query = query.query.function_score.query;
}
if (!("must" in query.query.bool)) {
query.query.bool.must = [];
} else if (!Array.isArray(query.query.bool.must)) {
query.query.bool.must = [query.query.bool.must];
}
query.query.bool.must.push({match: {_id: this.docId}});
delete query["sort"];
delete query["aggs"];
delete query["search_after"];
delete query.query["function_score"];
query._source = {
includes: ["content", "name", "path", "extension"]
}
query.size = 1;
Sist2Api.esQuery(query).then(resp => {
this.loading = false;
if (resp.hits.hits.length === 1) {
this.content = this.getContent(resp.hits.hits[0]);
}
if (this.optAutoAnalyze) {
this.mlAnalyze();
}
});
},
computed: {
...mapGetters(["optAutoAnalyze"]),
modelSize() {
const modelData = ModelsRepo.data[this.mlModel];
if (!modelData) {
return 0;
}
return modelData.size;
}
},
methods: {
...mapMutations(["setOptAutoAnalyze"]),
getContent(doc) {
this.rawContent = doc._source.content;
if (!doc.highlight) {
return doc._source.content;
}
if (doc.highlight["content.nGram"]) {
return doc.highlight["content.nGram"][0];
}
if (doc.highlight.content) {
return doc.highlight.content[0];
}
},
async getMlModel() {
if (this.$store.getters.mlModel.name !== this.mlModel) {
this.mlLoading = true;
this.modelLoadingProgress = 0;
const modelInfo = ModelsRepo.data[this.mlModel];
const model = new BertNerModel(
modelInfo.vocabUrl,
modelInfo.modelUrl,
modelInfo.id2label,
)
await model.init(progress => this.modelLoadingProgress = progress);
this.$store.commit("setMlModel", {model, name: this.mlModel});
this.mlLoading = false;
return model
}
return this.$store.getters.mlModel.model;
},
async mlAnalyze() {
if (!this.content) {
return;
}
const modelInfo = ModelsRepo.data[this.mlModel];
if (modelInfo === undefined) {
return;
}
this.$store.commit("setOptMlDefaultModel", this.mlModel);
await this.$store.dispatch("updateConfiguration");
const model = await this.getMlModel();
this.analyzedContentSpans = [];
this.mlPredictionsLoading = true;
await model.predict(this.rawContent, results => {
results.forEach(result => result.label = modelInfo.humanLabels[result.label]);
this.analyzedContentSpans.push(...results);
this.modelPredictionProgress = results[results.length - 1].wordIndex;
});
this.mlPredictionsLoading = false;
}
}
},
mounted() {
const query = Sist2Query.searchQuery();
if (this.$store.state.optHighlight) {
const fields = this.$store.state.fuzzy
? {"content.nGram": {}}
: {content: {}};
query.highlight = {
pre_tags: ["<mark>"],
post_tags: ["</mark>"],
number_of_fragments: 0,
fields,
};
if (!store.state.sist2Info.esVersionLegacy) {
query.highlight.max_analyzed_offset = 999_999;
}
}
if ("function_score" in query.query) {
query.query = query.query.function_score.query;
}
if (!("must" in query.query.bool)) {
query.query.bool.must = [];
} else if (!Array.isArray(query.query.bool.must)) {
query.query.bool.must = [query.query.bool.must];
}
query.query.bool.must.push({match: {_id: this.docId}});
delete query["sort"];
delete query["aggs"];
delete query["search_after"];
delete query.query["function_score"];
query._source = {
includes: ["content", "name", "path", "extension"]
}
query.size = 1;
Sist2Api.esQuery(query).then(resp => {
this.loading = false;
if (resp.hits.hits.length === 1) {
this.content = this.getContent(resp.hits.hits[0]);
} else {
console.log("FIXME: could not get content")
console.log(resp)
}
});
},
methods: {
getContent(doc) {
if (!doc.highlight) {
return doc._source.content;
}
if (doc.highlight["content.nGram"]) {
return doc.highlight["content.nGram"][0];
}
if (doc.highlight.content) {
return doc.highlight.content[0];
}
}
}
}
</script>
<style scoped>
<style>
.progress-bar {
transition: none;
}
</style>
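
A detail worth keeping in mind from the mounted() hook above: it reuses the active search query (so server-side highlighting reflects the current search terms) and then strips it down to a single-document lookup. A condensed, runnable sketch of that narrowing step over a plain object; the starting shape is a placeholder, not Sist2Query's real output:

// Hypothetical starting query; Sist2Query.searchQuery() builds the real one.
const query = {query: {bool: {filter: [{term: {index: "my-index"}}]}}, sort: ["_score"], size: 100};
const docId = "abc123";  // hypothetical document id

if (!("must" in query.query.bool)) {
    query.query.bool.must = [];
} else if (!Array.isArray(query.query.bool.must)) {
    query.query.bool.must = [query.query.bool.must];
}
query.query.bool.must.push({match: {_id: docId}});
delete query.sort;          // a single hit needs no ordering,
delete query.aggs;          // no aggregations,
delete query.search_after;  // and no pagination cursor
query._source = {includes: ["content", "name", "path", "extension"]};
query.size = 1;
console.log(JSON.stringify(query, null, 2));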


@@ -160,9 +160,13 @@ export default {
},
onSlideChange() {
// Pause all videos when changing slide
document.getElementsByTagName("video").forEach((el) => {
const videos = document.getElementsByTagName("video");
if (videos.length === 0) {
return
}
for (let el of videos) {
el.pause();
});
}
},
}
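
The fix above addresses a real DOM gotcha: document.getElementsByTagName() returns an HTMLCollection, which has no .forEach() method, so the old code threw on every slide change. The diff iterates with for...of; materializing the collection first is an equivalent alternative:

// Equivalent one-liner: convert the HTMLCollection to a real array before iterating.
Array.from(document.getElementsByTagName("video")).forEach(el => el.pause());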


@@ -40,6 +40,7 @@
<template v-for="tag in hit._tags">
<!-- User tag-->
<div v-if="tag.userTag" :key="tag.rawText" style="display: inline-block">
<span
:id="hit._id+tag.rawText"
@@ -51,7 +52,7 @@
>{{ tag.text.split(".").pop() }}</span>
<b-popover :target="hit._id+tag.rawText" triggers="focus blur" placement="top">
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{$t("deleteTag")}}</b-button>
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{ $t("deleteTag") }}</b-button>
</b-popover>
</div>
@@ -66,7 +67,7 @@
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">{{$t("addTag")}}</small>
<!-- Size tag-->
<small v-else class="text-muted badge-size">{{
<small v-else class="text-muted badge-size" style="padding-left: 2px">{{
humanFileSize(hit._source.size)
}}</small>
</div>
@@ -211,7 +212,7 @@ export default Vue.extend({
return matches.sort().map(match => {
return {
title: match.split(".").slice(0,-1).join("."),
title: match.split(".").slice(0, -1).join("."),
id: match
}
});
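
For context on the whitespace-only change above: user tags are dotted hierarchies, and the badge earlier in this file renders only the leaf segment (tag.text.split(".").pop()), while the suggestion entries built here use everything before the leaf as the title and the full string as the id. For example:

// Hypothetical tag id: leaf segment for the badge text, parent path for the title.
const match = "documents.invoices.2023";
console.log(match.split(".").pop());                   // "2023"
console.log(match.split(".").slice(0, -1).join("."));  // "documents.invoices"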


@@ -8,7 +8,7 @@ export default {
advanced: "Advanced search",
fuzzy: "Fuzzy"
},
addTag: "Add",
addTag: "Tag",
deleteTag: "Delete",
download: "Download",
and: "and",
@@ -17,6 +17,7 @@ export default {
mimeTypes: "Media types",
tags: "Tags",
tagFilter: "Filter tags",
forExample: "For example:",
help: {
simpleSearch: "Simple search",
advancedSearch: "Advanced search",
@@ -48,6 +49,7 @@ export default {
configReset: "Reset configuration",
searchOptions: "Search options",
treemapOptions: "Treemap options",
mlOptions: "Machine learning options",
displayOptions: "Display options",
opt: {
lang: "Language",
@@ -75,7 +77,12 @@ export default {
useDatePicker: "Use a Date Picker component rather than a slider",
vidPreviewInterval: "Video preview frame duration in ms",
simpleLightbox: "Disable animations in image viewer",
showTagPickerFilter: "Display the tag filter bar"
showTagPickerFilter: "Display the tag filter bar",
featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
featuredFieldsList: "Available variables",
autoAnalyze: "Automatically analyze text",
defaultModel: "Default model",
mlRepositories: "Model repositories (one per line)"
},
queryMode: {
simple: "Simple",
@@ -83,6 +90,7 @@ export default {
},
lang: {
en: "English",
de: "Deutsch",
fr: "Français",
"zh-CN": "简体中文",
},
@@ -167,6 +175,185 @@ export default {
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
ml: {
analyzeText: "Analyze",
auto: "Auto",
repoFetchError: "Failed to get list of models. Check browser console for more details.",
repoFetchErrorTitle: "Could not fetch model repositories",
}
},
de: {
filePage: {
notFound: "Nicht gefunden"
},
searchBar: {
simple: "Suche",
advanced: "Erweiterte Suche",
fuzzy: "Fuzzy"
},
addTag: "Tag",
deleteTag: "Löschen",
download: "Herunterladen",
and: "und",
page: "Seite",
pages: "Seiten",
mimeTypes: "Medientypen",
tags: "Tags",
tagFilter: "Tags filtern",
forExample: "Zum Beispiel:",
help: {
simpleSearch: "Einfache Suche",
advancedSearch: "Erweiterte Suche",
help: "Hilfe",
term: "<BEGRIFF>",
and: "UND Operator",
or: "ODER Operator",
not: "negiert einen einzelnen Begriff",
quotes: "liefert Treffer, wenn die Abfolge in der genauen Reihenfolge gefunden wird",
prefix: "liefert Treffer, wenn die Abfolge einen solchen Präfix hat",
parens: "gruppiert Ausdrücke",
tildeTerm: "liefert Treffer, im gegebenen 'Editierabstand'",
tildePhrase: "liefert Treffer für den Ausdruck, wobei die angegebene Anzahl zwischenstehender Nicht-Treffer-Wörter erlaubt ist.",
example1:
"Zum Beispiel: <code>\"fried eggs\" +(eggplant | potato) -frittata</code> wird " +
"<i>fried eggs</i> und <i>eggplant</i> oder <i>potato</i> finden, aber keine Ergebnisse, " +
"die <i>frittata</i> enthalten.",
defaultOperator:
"Wenn weder <code>+</code> noch <code>|</code> angegeben sind, ist " +
"<code>+</code> (and) der Standard.",
fuzzy:
"Wenn <b>Fuzzy</b> aktiviert ist, werden Teil-Treffer (3-grams) ebenfalls akzeptiert.",
moreInfoSimple: "Für weitere Informationen s.<a target=\"_blank\" " +
"rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html\">Elasticsearch Dokumentation</a>",
moreInfoAdvanced: "Für die Dokumentation der erweiterten Suche s. <a target=\"_blank\" rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax\">Elasticsearch Dokumentation</a>"
},
config: "Konfiguration",
configDescription: "Konfiguration wird in Echtzeit für diesen Browser gespeichert.",
configReset: "Konfiguration zurücksetzen",
searchOptions: "Such-Optionen",
treemapOptions: "Kacheldiagramm-Optionen",
displayOptions: "Anzeige-Optionen",
opt: {
lang: "Sprache",
highlight: "Aktiviere Hervorhebung von Treffern",
fuzzy: "Aktiviere Fuzzy-Suche standardmäßig",
searchInPath: "Abgleich der Abfrage mit dem Dokumentpfad aktivieren",
suggestPath: "Aktiviere Auto-Vervollständigung in Pfadfilter-Leiste",
fragmentSize: "Kontextgröße in Zeichen hervorheben",
queryMode: "Such-Modus",
displayMode: "Ansicht",
columns: "Anzahl Spalten",
treemapType: "Kacheldiagramme Typ",
treemapTiling: "Kacheldiagramm Tiling",
treemapColorGroupingDepth: "Kacheldiagramme Gruppierungsfarbe Tiefe (flach)",
treemapColor: "Kacheldiagramme Farbe (kaskadiert)",
treemapSize: "Kacheldiagramm Größe",
theme: "Theme",
lightboxLoadOnlyCurrent: "keine Bilder in voller Größe für benachbarte Slides im Image-Viewer vorab laden.",
slideDuration: "Slide Dauer",
resultSize: "Anzahl Treffer pro Seite",
tagOrOperator: "Verwende ODER Operator bei der Angabe mehrere Tags.",
hideDuplicates: "Verstecke Duplikate basierend auf der Prüfsumme",
hideLegacy: "Verstecke die 'legacyES' Elasticsearch Notiz",
updateMimeMap: "Aktualisiere Medientyp-Baum in Echtzeit",
useDatePicker: "Benutze Datumswähler statt Schieber",
vidPreviewInterval: "Videovorschau Framedauer in ms",
simpleLightbox: "Schalte Animationen im Image-Viewer ab",
showTagPickerFilter: "Zeige die Tag-Filter-Leiste",
featuredFields: "Variablen, welche zusätzlich in den Suchergebnissen angezeigt werden können.",
featuredFieldsList: "verfügbare Variablen"
},
queryMode: {
simple: "Einfach",
advanced: "Erweitert",
},
lang: {
en: "English",
de: "Deutsch",
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Gitter",
list: "Liste",
},
columns: {
auto: "Auto"
},
treemapType: {
cascaded: "kaskadiert",
flat: "flach (kompakt)"
},
treemapSize: {
small: "klein",
medium: "mittel",
large: "groß",
xLarge: "sehr groß",
xxLarge: "riesig",
custom: "eigene",
},
treemapTiling: {
binary: "binär",
squarify: "quadratisch",
slice: "Slice",
dice: "Dice",
sliceDice: "Slice & Dice",
},
theme: {
light: "Hell",
black: "Dunkel"
},
hit: "Treffer",
hits: "Treffer",
details: "Details",
stats: "Statistiken",
queryTime: "Abfragedauer",
totalSize: "Gesamtgröße",
pathBar: {
placeholder: "Filter Pfad",
modalTitle: "Wähle Pfad"
},
debug: "Debug Informationen",
debugDescription: "Informationen für das Debugging. Wenn du Bugs gefunden oder Anregungen für " +
"neue Features hast, poste sie bitte <a href='https://github.com/simon987/sist2/issues/new/choose'>hier</a>.",
tagline: "Tagline",
toast: {
esConnErrTitle: "Elasticsearch Verbindungsfehler",
esConnErr: "sist2 Web-Modul stellte einen Fehler beim Verbinden mit Elasticsearch fest. " +
"Schau in die Server-Logs für weitere Informationen.",
esQueryErrTitle: "Query Fehler",
esQueryErr: "Konnte Query nicht verarbeiten/ausführen, bitte schaue in die Dokumentation zur erweiterten Suche. " +
"Schau in die Server-Logs für weitere Informationen.",
dupeTagTitle: "Tag Duplikat",
dupeTag: "Dieser Tag existiert bereits für das Dokument.",
copiedToClipboard: "In die Zwischenablage kopiert."
},
saveTagModalTitle: "Tag hinzufügen",
saveTagPlaceholder: "Tag Name",
confirm: "Bestätigen",
indexPickerPlaceholder: "Index auswählen",
sort: {
relevance: "Relevanz",
dateAsc: "Datum (älteste zuerst)",
dateDesc: "Datum (neuste zuerst)",
sizeAsc: "Größe (kleinste zuerst)",
sizeDesc: "Größe (größte zuerst)",
nameAsc: "Name (A-z)",
nameDesc: "Name (Z-a)",
random: "zufällig",
},
d3: {
mimeCount: "Anzahl nach Medientyp",
mimeSize: "Größen nach Medientyp",
dateHistogram: "Änderungszeiten",
sizeHistogram: "Dateigrößen",
},
indexPicker: {
selectNone: "keinen auswählen",
selectAll: "alle auswählen",
selectedIndex: "ausgewählter Index",
selectedIndices: "ausgewählte Indizes",
},
},
fr: {
filePage: {
@@ -177,7 +364,7 @@ export default {
advanced: "Recherche avancée",
fuzzy: "Approximatif"
},
addTag: "Ajouter",
addTag: "Taguer",
deleteTag: "Supprimer",
download: "Télécharger",
and: "et",
@@ -186,6 +373,7 @@ export default {
mimeTypes: "Types de médias",
tags: "Tags",
tagFilter: "Filtrer les tags",
forExample: "Par exemple:",
help: {
simpleSearch: "Recherche simple",
advancedSearch: "Recherche avancée",
@@ -245,7 +433,9 @@ export default {
useDatePicker: "Afficher un composant « Date Picker » plutôt qu'un slider",
vidPreviewInterval: "Durée des images d'aperçu video en millisecondes",
simpleLightbox: "Désactiver les animations du visualiseur d'images",
showTagPickerFilter: "Afficher le filtre dans l'onglet Tags"
showTagPickerFilter: "Afficher le filtre dans l'onglet Tags",
featuredFields: "Expression Javascript pour les variables mises en évidence. Sera affiché dans les résultats de recherche.",
featuredFieldsList: "Variables disponibles"
},
queryMode: {
simple: "Simple",
@@ -253,6 +443,7 @@ export default {
},
lang: {
en: "English",
de: "Deutsch",
fr: "Français",
"zh-CN": "简体中文",
},
@@ -348,7 +539,7 @@ export default {
advanced: "高级搜索",
fuzzy: "模糊搜索"
},
addTag: "添加",
addTag: "签条",
deleteTag: "删除",
download: "下载",
and: "与",
@@ -357,6 +548,7 @@ export default {
mimeTypes: "文件类型",
tags: "标签",
tagFilter: "筛选标签",
forExample: "例如:",
help: {
simpleSearch: "简易搜索",
advancedSearch: "高级搜索",
@@ -415,7 +607,9 @@ export default {
useDatePicker: "使用日期选择器组件而不是滑块",
vidPreviewInterval: "视频预览帧的持续时间,以毫秒为单位",
simpleLightbox: "在图片查看器中,禁用动画",
showTagPickerFilter: "显示标签过滤栏"
showTagPickerFilter: "显示标签过滤栏",
featuredFields: "特色领域的Javascript模板字符串。将出现在搜索结果中。",
featuredFieldsList: "可利用的变量"
},
queryMode: {
simple: "简单",
@@ -423,6 +617,7 @@ export default {
},
lang: {
en: "English",
de: "Deutsch",
fr: "Français",
"zh-CN": "简体中文",
},

View File

@@ -0,0 +1,77 @@
import BertTokenizer from "@/ml/BertTokenizer";
import * as tf from "@tensorflow/tfjs";
import axios from "axios";
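// Client-side named-entity recognition: runs a TensorFlow.js graph model over
// WordPiece-tokenized text, `inputSize` tokens at a time, and maps predicted
// label ids back to whole words via `id2label`.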
export default class BertNerModel {
vocabUrl;
modelUrl;
id2label;
_tokenizer;
_model;
inputSize = 128;
_previousWordId = null;
constructor(vocabUrl, modelUrl, id2label) {
this.vocabUrl = vocabUrl;
this.modelUrl = modelUrl;
this.id2label = id2label;
}
async init(onProgress) {
await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
}
async loadTokenizer() {
const vocab = (await axios.get(this.vocabUrl)).data;
this._tokenizer = new BertTokenizer(vocab);
}
async loadModel(onProgress) {
this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
}
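// Map per-token labels back to whole words: wordId -1 marks special/padding
// tokens, and a repeated wordId marks a subword continuation; both are skipped
// so each word is reported once, with the label of its first subtoken.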
alignLabels(labels, wordIds, words) {
const result = [];
for (let i = 0; i < this.inputSize; i++) {
const label = labels[i];
const wordId = wordIds[i];
if (wordId === -1) {
continue;
}
if (wordId === this._previousWordId) {
continue;
}
result.push({
word: words[wordId].text, wordIndex: words[wordId].index, label: label
});
this._previousWordId = wordId;
}
return result;
}
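// Tokenize the full text, run the model on each fixed-size chunk, and report
// aligned {word, wordIndex, label} results through `callback` per chunk.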
async predict(text, callback) {
this._previousWordId = null;
const encoded = this._tokenizer.encodeText(text, this.inputSize);
for (let chunk of encoded.inputChunks) {
const rawResult = tf.tidy(() => this._model.execute({
input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
}));
const labelIds = tf.argMax(rawResult, -1);
const labelIdsArray = await labelIds.array();
const labels = labelIdsArray[0].map(id => this.id2label[id]);
// Dispose both tensors once the label ids have been copied to a JS array.
rawResult.dispose();
labelIds.dispose();
callback(this.alignLabels(labels, chunk.wordIds, encoded.words));
}
}
}
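A minimal usage sketch for BertNerModel (the URLs and the id2label mapping below are hypothetical placeholders; real values come from a model repository entry):
// Hypothetical wiring of the class above; URLs and labels are placeholders.
const model = new BertNerModel(
"https://example.com/ner/vocab.json",
"https://example.com/ner/model.json",
{0: "O", 1: "B-PER", 2: "I-PER"},
);
await model.init((fraction) => console.log(`model ${Math.round(fraction * 100)}% loaded`));
await model.predict("Jane Doe lives in Montreal", (results) => {
// results: [{word, wordIndex, label}, ...] for one chunk
results.forEach(({word, label}) => console.log(word, label));
});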

View File

@@ -0,0 +1,184 @@
import {zip, chunk} from "underscore";
const UNK_INDEX = 100;
const CLS_INDEX = 101;
const SEP_INDEX = 102;
const CONTINUING_SUBWORD_PREFIX = "##";
function isWhitespace(ch) {
return /\s/.test(ch);
}
function isInvalid(ch) {
return (ch.charCodeAt(0) === 0 || ch.charCodeAt(0) === 0xfffd);
}
const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';
/** To judge whether it's a punctuation. */
function isPunctuation(ch) {
return punctuations.indexOf(ch) !== -1;
}
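// WordPiece tokenizer for BERT-style models: cleans the input, splits on
// whitespace and punctuation while tracking original character offsets, then
// encodes words into vocabulary ids.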
export default class BertTokenizer {
vocab;
constructor(vocab) {
this.vocab = vocab;
}
tokenize(text) {
const charOriginalIndex = [];
const cleanedText = this.cleanText(text, charOriginalIndex);
const origTokens = cleanedText.split(' ');
let charCount = 0;
const tokens = origTokens.map((token) => {
token = token.toLowerCase();
const tokens = this.runSplitOnPunctuation(token, charCount, charOriginalIndex);
charCount += token.length + 1;
return tokens;
});
let flattenTokens = [];
for (let index = 0; index < tokens.length; index++) {
flattenTokens = flattenTokens.concat(tokens[index]);
}
return flattenTokens;
}
/* Performs invalid character removal and whitespace cleanup on text. */
cleanText(text, charOriginalIndex) {
text = text.replace(/\?/g, "").trim();
const stringBuilder = [];
let originalCharIndex = 0;
let newCharIndex = 0;
for (const ch of text) {
// Skip the characters that cannot be used.
if (isInvalid(ch)) {
originalCharIndex += ch.length;
continue;
}
if (isWhitespace(ch)) {
if (stringBuilder.length > 0 && stringBuilder[stringBuilder.length - 1] !== ' ') {
stringBuilder.push(' ');
charOriginalIndex[newCharIndex] = originalCharIndex;
originalCharIndex += ch.length;
} else {
originalCharIndex += ch.length;
continue;
}
} else {
stringBuilder.push(ch);
charOriginalIndex[newCharIndex] = originalCharIndex;
originalCharIndex += ch.length;
}
newCharIndex++;
}
return stringBuilder.join('');
}
/* Splits punctuation on a piece of text. */
runSplitOnPunctuation(text, count, charOriginalIndex) {
const tokens = [];
let startNewWord = true;
for (const ch of text) {
if (isPunctuation(ch)) {
tokens.push({text: ch, index: charOriginalIndex[count]});
count += ch.length;
startNewWord = true;
} else {
if (startNewWord) {
tokens.push({text: '', index: charOriginalIndex[count]});
startNewWord = false;
}
tokens[tokens.length - 1].text += ch;
count += ch.length;
}
}
return tokens;
}
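// Greedy longest-match-first WordPiece encoding: non-initial subwords get the
// "##" prefix, words with no vocabulary match become [UNK], and wordIds
// records which source word each subtoken belongs to.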
encode(words) {
let outputTokens = [];
const wordIds = [];
for (let i = 0; i < words.length; i++) {
let chars = [...words[i].text];
let isUnknown = false;
let start = 0;
let subTokens = [];
while (start < chars.length) {
let end = chars.length;
let currentSubstring = null;
while (start < end) {
let substr = chars.slice(start, end).join('');
if (start > 0) {
substr = CONTINUING_SUBWORD_PREFIX + substr;
}
if (this.vocab.includes(substr)) {
currentSubstring = this.vocab.indexOf(substr);
break;
}
--end;
}
if (currentSubstring == null) {
isUnknown = true;
break;
}
subTokens.push(currentSubstring);
start = end;
}
if (isUnknown) {
outputTokens.push(UNK_INDEX);
wordIds.push(i);
} else {
subTokens.forEach(tok => {
outputTokens.push(tok);
wordIds.push(i);
});
}
}
return {tokens: outputTokens, wordIds};
}
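// Split the encoded tokens into chunks of (inputSize - 2), wrap each chunk in
// [CLS]/[SEP], and zero-pad ids, mask and segment ids up to inputSize; wordIds
// uses -1 for the special and padding positions.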
encodeText(inputText, inputSize) {
const tokenized = this.tokenize(inputText);
const encoded = this.encode(tokenized);
const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);
const chunks = [];
zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, wordIds]) => {
const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
const segmentIds = Array(inputIds.length).fill(0);
const inputMask = Array(inputIds.length).fill(1);
wordIds = [-1, ...wordIds, -1];
while (inputIds.length < inputSize) {
inputIds.push(0);
inputMask.push(0);
segmentIds.push(0);
wordIds.push(-1);
}
chunks.push({inputIds, inputMask, segmentIds, wordIds});
});
return {
inputChunks: chunks,
words: tokenized
};
}
}
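A toy illustration of encodeText() (the three-entry vocabulary is a placeholder; real vocabularies ship with the model, where ids 100-102 are [UNK]/[CLS]/[SEP]):
// Placeholder vocabulary; subword entries carry the "##" prefix.
const tokenizer = new BertTokenizer(["[PAD]", "hello", "##world"]);
const {inputChunks, words} = tokenizer.encodeText("helloworld", 8);
// inputChunks[0].inputIds -> [101, 1, 2, 102, 0, 0, 0, 0]   ([CLS] hello ##world [SEP] + padding)
// inputChunks[0].wordIds  -> [-1, 0, 0, -1, -1, -1, -1, -1] (both subtokens map to word 0)
// words                   -> [{text: "helloworld", index: 0}]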

View File

@@ -0,0 +1,43 @@
import axios from "axios";
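// Fetches model manifests from one or more repository URLs and resolves each
// entry's modelPath/vocabPath relative to its repository.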
class ModelsRepo {
_repositories;
data = {};
async init(repositories) {
this._repositories = repositories;
const data = await Promise.all(this._repositories.map(this._loadRepository));
data.forEach(models => {
models.forEach(model => {
this.data[model.name] = model;
})
});
}
async _loadRepository(repository) {
const data = (await axios.get(repository)).data;
data.forEach(model => {
model["modelUrl"] = new URL(model["modelPath"], repository).href;
model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
});
return data;
}
getOptions() {
return Object.values(this.data).map(model => ({
text: `${model.name} (${Math.round(model.size / (1024*1024))}MB)`,
value: model.name
}));
}
getDefaultModel() {
// Guard against repositories where no entry is flagged as default.
const defaultModel = Object.values(this.data).find(model => model.default);
return defaultModel ? defaultModel.name : null;
}
}
export default new ModelsRepo();
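Usage sketch for the repository loader (the import path is an assumption; the repository URL matches the default configured in the store below):
// Load model manifests, then list the options and the default entry.
import modelsRepo from "@/ml/ModelsRepo";
await modelsRepo.init(["https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json"]);
console.log(modelsRepo.getOptions());      // e.g. [{text: "model-name (12MB)", value: "model-name"}]
console.log(modelsRepo.getDefaultModel()); // name of the entry flagged `default`, or null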

View File

@@ -33,6 +33,7 @@ export default new Vuex.Store({
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
optFeaturedFields: "",
optSize: 60,
optHighlight: true,
@@ -56,6 +57,9 @@ export default new Vuex.Store({
optVidPreviewInterval: 700,
optSimpleLightbox: true,
optShowTagPickerFilter: true,
optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
optAutoAnalyze: false,
optMlDefaultModel: null,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@@ -85,7 +89,11 @@ export default new Vuex.Store({
uiMimeMap: [] as any[],
auth0Token: null
auth0Token: null,
mlModel: {
model: null,
name: null
},
},
mutations: {
setUiShowDetails: (state, val) => state.uiShowDetails = val,
@@ -158,6 +166,7 @@ export default new Vuex.Store({
setOptQueryMode: (state, val) => state.optQueryMode = val,
setOptResultSize: (state, val) => state.optSize = val,
setOptTagOrOperator: (state, val) => state.optTagOrOperator = val,
setOptFeaturedFields: (state, val) => state.optFeaturedFields = val,
setOptTreemapType: (state, val) => state.optTreemapType = val,
setOptTreemapTiling: (state, val) => state.optTreemapTiling = val,
@@ -170,6 +179,9 @@ export default new Vuex.Store({
setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
@@ -192,6 +204,7 @@ export default new Vuex.Store({
// noop
},
setAuth0Token: (state, val) => state.auth0Token = val,
setMlModel: (state, val) => state.mlModel = val,
},
actions: {
setSist2Info: (store, val) => {
@@ -348,6 +361,7 @@ export default new Vuex.Store({
},
modules: {},
getters: {
mlModel: (state) => state.mlModel,
seed: (state) => state.seed,
getPathText: (state) => state.pathText,
indices: state => state.indices,
@@ -413,5 +427,13 @@ export default new Vuex.Store({
optVidPreviewInterval: state => state.optVidPreviewInterval,
optSimpleLightbox: state => state.optSimpleLightbox,
optShowTagPickerFilter: state => state.optShowTagPickerFilter,
optFeaturedFields: state => state.optFeaturedFields,
optMlRepositories: state => state.optMlRepositories,
mlRepositoryList: state => {
const repos = state.optMlRepositories.split("\n");
return repos[0] === "" ? [] : repos;
},
optMlDefaultModel: state => state.optMlDefaultModel,
optAutoAnalyze: state => state.optAutoAnalyze,
}
})

View File

@@ -57,6 +57,14 @@ export function humanTime(sec_num: number): string {
const minutes = Math.floor((sec_num - (hours * 3600)) / 60);
const seconds = sec_num - (hours * 3600) - (minutes * 60);
if (sec_num < 60) {
return `${sec_num}s`
}
if (sec_num < 3600) {
return `${minutes < 10 ? "0" : ""}${minutes}:${seconds < 10 ? "0" : ""}${seconds}`;
}
return `${hours < 10 ? "0" : ""}${hours}:${minutes < 10 ? "0" : ""}${minutes}:${seconds < 10 ? "0" : ""}${seconds}`;
}
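Worked examples for the three branches above:
// humanTime(42)   -> "42s"       (under one minute)
// humanTime(75)   -> "01:15"     (under one hour: mm:ss)
// humanTime(3725) -> "01:02:05"  (hh:mm:ss)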

View File

@@ -1,144 +1,218 @@
<template>
<!-- <div :style="{width: `${$store.getters.optContainerWidth}px`}"-->
<div
v-if="!configLoading"
style="margin-left: auto; margin-right: auto;" class="container">
<b-card>
<b-card-title>
<GearIcon></GearIcon>
{{ $t("config") }}
</b-card-title>
<p>{{ $t("configDescription") }}</p>
<b-card-body>
<h4>{{ $t("displayOptions") }}</h4>
<div
v-if="!configLoading"
style="margin-left: auto; margin-right: auto;" class="container">
<b-card>
<b-card-title>
<GearIcon></GearIcon>
{{ $t("config") }}
</b-card-title>
<p>{{ $t("configDescription") }}</p>
<label><LanguageIcon/><span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<b-card-body>
<h4>{{ $t("displayOptions") }}</h4>
<label>{{ $t("opt.theme") }}</label>
<b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>
<b-card>
<label>{{ $t("opt.displayMode") }}</label>
<b-form-select :options="displayModeOptions" :value="optDisplay" @input="setOptDisplay"></b-form-select>
<label>
<LanguageIcon/>
<span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<label>{{ $t("opt.theme") }}</label>
<b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>
<div style="height: 10px"></div>
<label>{{ $t("opt.displayMode") }}</label>
<b-form-select :options="displayModeOptions" :value="optDisplay"
@input="setOptDisplay"></b-form-select>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<div style="height: 10px"></div>
<b-form-checkbox :checked="optUpdateMimeMap" @input="setOptUpdateMimeMap">
{{ $t("opt.updateMimeMap") }}
</b-form-checkbox>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optUseDatePicker" @input="setOptUseDatePicker">
{{ $t("opt.useDatePicker") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<b-form-checkbox :checked="optSimpleLightbox" @input="setOptSimpleLightbox">{{
$t("opt.simpleLightbox")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optUpdateMimeMap" @input="setOptUpdateMimeMap">
{{ $t("opt.updateMimeMap") }}
</b-form-checkbox>
<b-form-checkbox :checked="optShowTagPickerFilter" @input="setOptShowTagPickerFilter">{{
$t("opt.showTagPickerFilter")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optUseDatePicker" @input="setOptUseDatePicker">
{{ $t("opt.useDatePicker") }}
</b-form-checkbox>
<b-form-checkbox :checked="optSimpleLightbox" @input="setOptSimpleLightbox">{{
$t("opt.simpleLightbox")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optShowTagPickerFilter" @input="setOptShowTagPickerFilter">{{
$t("opt.showTagPickerFilter")
}}
</b-form-checkbox>
<br/>
<label>{{ $t("opt.featuredFields") }}</label>
<br>
<b-button v-b-toggle.collapse-1 variant="secondary" class="dropdown-toggle">{{
$t("opt.featuredFieldsList")
}}
</b-button>
<b-collapse id="collapse-1" class="mt-2">
<ul>
<li><code>doc.checksum</code></li>
<li><code>doc.path</code></li>
<li><code>doc.mime</code></li>
<li><code>doc.videoc</code></li>
<li><code>doc.audioc</code></li>
<li><code>doc.pages</code></li>
<li><code>doc.mtime</code></li>
<li><code>doc.font_name</code></li>
<li><code>doc.album</code></li>
<li><code>doc.artist</code></li>
<li><code>doc.title</code></li>
<li><code>doc.genre</code></li>
<li><code>doc.album_artist</code></li>
<li><code>doc.exif_make</code></li>
<li><code>doc.exif_model</code></li>
<li><code>doc.exif_software</code></li>
<li><code>doc.exif_exposure_time</code></li>
<li><code>doc.exif_fnumber</code></li>
<li><code>doc.exif_iso_speed_ratings</code></li>
<li><code>doc.exif_focal_length</code></li>
<li><code>doc.exif_user_comment</code></li>
<li><code>doc.exif_gps_longitude_ref</code></li>
<li><code>doc.exif_gps_longitude_dms</code></li>
<li><code>doc.exif_gps_longitude_dec</code></li>
<li><code>doc.exif_gps_latitude_ref</code></li>
<li><code>doc.exif_gps_latitude_dec</code></li>
<li><code>humanDate()</code></li>
<li><code>humanFileSize()</code></li>
</ul>
<p>{{ $t("forExample") }}</p>
<ul>
<li>
<code>&lt;b&gt;${humanDate(doc.mtime)}&lt;/b&gt; ${doc.videoc || ''}</code>
</li>
<li>
<code>${doc.pages ? (doc.pages + ' pages') : ''}</code>
</li>
</ul>
</b-collapse>
<br/>
<br/>
<b-textarea rows="3" :value="optFeaturedFields" @input="setOptFeaturedFields"></b-textarea>
</b-card>
<br/>
<h4>{{ $t("searchOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optHideDuplicates" @input="setOptHideDuplicates">{{
$t("opt.hideDuplicates")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{
$t("opt.highlight")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
$t("opt.tagOrOperator")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optFuzzy" @input="setOptFuzzy">{{ $t("opt.fuzzy") }}</b-form-checkbox>
<b-form-checkbox :checked="optSearchInPath" @input="setOptSearchInPath">{{
$t("opt.searchInPath")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optSuggestPath" @input="setOptSuggestPath">{{
$t("opt.suggestPath")
}}
</b-form-checkbox>
<br/>
<label>{{ $t("opt.fragmentSize") }}</label>
<b-form-input :value="optFragmentSize" step="10" type="number" min="0"
@input="setOptFragmentSize"></b-form-input>
<label>{{ $t("opt.resultSize") }}</label>
<b-form-input :value="optResultSize" type="number" min="10"
@input="setOptResultSize"></b-form-input>
<label>{{ $t("opt.queryMode") }}</label>
<b-form-select :options="queryModeOptions" :value="optQueryMode"
@input="setOptQueryMode"></b-form-select>
<label>{{ $t("opt.slideDuration") }}</label>
<b-form-input :value="optLightboxSlideDuration" type="number" min="1"
@input="setOptLightboxSlideDuration"></b-form-input>
<label>{{ $t("opt.vidPreviewInterval") }}</label>
<b-form-input :value="optVidPreviewInterval" type="number" min="50"
@input="setOptVidPreviewInterval"></b-form-input>
</b-card>
<h4 class="mt-3">{{ $t("mlOptions") }}</h4>
<b-card>
<label>{{ $t("opt.mlRepositories") }}</label>
<b-textarea rows="3" :value="optMlRepositories" @input="setOptMlRepositories"></b-textarea>
<br>
<b-form-checkbox :checked="optAutoAnalyze" @input="setOptAutoAnalyze">{{
$t("opt.autoAnalyze")
}}
</b-form-checkbox>
</b-card>
<h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
<b-card>
<label>{{ $t("opt.treemapType") }}</label>
<b-form-select :value="optTreemapType" :options="treemapTypeOptions"
@input="setOptTreemapType"></b-form-select>
<label>{{ $t("opt.treemapTiling") }}</label>
<b-form-select :value="optTreemapTiling" :options="treemapTilingOptions"
@input="setOptTreemapTiling"></b-form-select>
<label>{{ $t("opt.treemapColorGroupingDepth") }}</label>
<b-form-input :value="optTreemapColorGroupingDepth" type="number" min="1"
@input="setOptTreemapColorGroupingDepth"></b-form-input>
<label>{{ $t("opt.treemapSize") }}</label>
<b-form-select :value="optTreemapSize" :options="treemapSizeOptions"
@input="setOptTreemapSize"></b-form-select>
<template v-if="$store.getters.optTreemapSize === 'custom'">
<!-- TODO Width/Height input -->
<b-form-input type="number" min="0" step="10"></b-form-input>
<b-form-input type="number" min="0" step="10"></b-form-input>
</template>
<label>{{ $t("opt.treemapColor") }}</label>
<b-form-select :value="optTreemapColor" :options="treemapColorOptions"
@input="setOptTreemapColor"></b-form-select>
</b-card>
<b-button variant="danger" class="mt-4" @click="onResetClick()">{{ $t("configReset") }}</b-button>
</b-card-body>
</b-card>
<br/>
<h4>{{ $t("searchOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optHideDuplicates" @input="setOptHideDuplicates">{{
$t("opt.hideDuplicates")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
$t("opt.tagOrOperator")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optFuzzy" @input="setOptFuzzy">{{ $t("opt.fuzzy") }}</b-form-checkbox>
<b-form-checkbox :checked="optSearchInPath" @input="setOptSearchInPath">{{
$t("opt.searchInPath")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optSuggestPath" @input="setOptSuggestPath">{{
$t("opt.suggestPath")
}}
</b-form-checkbox>
<br/>
<label>{{ $t("opt.fragmentSize") }}</label>
<b-form-input :value="optFragmentSize" step="10" type="number" min="0"
@input="setOptFragmentSize"></b-form-input>
<label>{{ $t("opt.resultSize") }}</label>
<b-form-input :value="optResultSize" type="number" min="10"
@input="setOptResultSize"></b-form-input>
<label>{{ $t("opt.queryMode") }}</label>
<b-form-select :options="queryModeOptions" :value="optQueryMode" @input="setOptQueryMode"></b-form-select>
<label>{{ $t("opt.slideDuration") }}</label>
<b-form-input :value="optLightboxSlideDuration" type="number" min="1"
@input="setOptLightboxSlideDuration"></b-form-input>
<label>{{ $t("opt.vidPreviewInterval") }}</label>
<b-form-input :value="optVidPreviewInterval" type="number" min="50"
@input="setOptVidPreviewInterval"></b-form-input>
<b-card v-if="loading" class="mt-4">
<Preloader></Preloader>
</b-card>
<h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
<b-card>
<label>{{ $t("opt.treemapType") }}</label>
<b-form-select :value="optTreemapType" :options="treemapTypeOptions"
@input="setOptTreemapType"></b-form-select>
<label>{{ $t("opt.treemapTiling") }}</label>
<b-form-select :value="optTreemapTiling" :options="treemapTilingOptions"
@input="setOptTreemapTiling"></b-form-select>
<label>{{ $t("opt.treemapColorGroupingDepth") }}</label>
<b-form-input :value="optTreemapColorGroupingDepth" type="number" min="1"
@input="setOptTreemapColorGroupingDepth"></b-form-input>
<label>{{ $t("opt.treemapSize") }}</label>
<b-form-select :value="optTreemapSize" :options="treemapSizeOptions"
@input="setOptTreemapSize"></b-form-select>
<template v-if="$store.getters.optTreemapSize === 'custom'">
<!-- TODO Width/Height input -->
<b-form-input type="number" min="0" step="10"></b-form-input>
<b-form-input type="number" min="0" step="10"></b-form-input>
</template>
<label>{{ $t("opt.treemapColor") }}</label>
<b-form-select :value="optTreemapColor" :options="treemapColorOptions"
@input="setOptTreemapColor"></b-form-select>
</b-card>
<b-button variant="danger" class="mt-4" @click="onResetClick()">{{ $t("configReset") }}</b-button>
</b-card-body>
</b-card>
<b-card v-if="loading" class="mt-4">
<Preloader></Preloader>
</b-card>
<DebugInfo v-else></DebugInfo>
</div>
<DebugInfo v-else></DebugInfo>
</div>
</template>
<script>
@@ -150,161 +224,168 @@ import GearIcon from "@/components/icons/GearIcon.vue";
import LanguageIcon from "@/components/icons/LanguageIcon";
export default {
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: false,
configLoading: false,
langOptions: [
{value: "en", text: this.$t("lang.en")},
{value: "fr", text: this.$t("lang.fr")},
{value: "zh-CN", text: this.$t("lang.zh-CN")},
],
queryModeOptions: [
{value: "simple", text: this.$t("queryMode.simple")},
{value: "advanced", text: this.$t("queryMode.advanced")}
],
displayModeOptions: [
{value: "grid", text: this.$t("displayMode.grid")},
{value: "list", text: this.$t("displayMode.list")}
],
columnsOptions: [
{value: "auto", text: this.$t("columns.auto")},
{value: 1, text: "1"},
{value: 2, text: "2"},
{value: 3, text: "3"},
{value: 4, text: "4"},
{value: 5, text: "5"},
{value: 6, text: "6"},
{value: 7, text: "7"},
{value: 8, text: "8"},
{value: 9, text: "9"},
{value: 10, text: "10"},
{value: 11, text: "11"},
{value: 12, text: "12"},
],
treemapTypeOptions: [
{value: "cascaded", text: this.$t("treemapType.cascaded")},
{value: "flat", text: this.$t("treemapType.flat")}
],
treemapTilingOptions: [
{value: "binary", text: this.$t("treemapTiling.binary")},
{value: "squarify", text: this.$t("treemapTiling.squarify")},
{value: "slice", text: this.$t("treemapTiling.slice")},
{value: "dice", text: this.$t("treemapTiling.dice")},
{value: "sliceDice", text: this.$t("treemapTiling.sliceDice")},
],
treemapSizeOptions: [
{value: "small", text: this.$t("treemapSize.small")},
{value: "medium", text: this.$t("treemapSize.medium")},
{value: "large", text: this.$t("treemapSize.large")},
{value: "x-large", text: this.$t("treemapSize.xLarge")},
{value: "xx-large", text: this.$t("treemapSize.xxLarge")},
// {value: "custom", text: this.$t("treemapSize.custom")},
],
treemapColorOptions: [
{value: "PuBuGn", text: "Purple-Blue-Green"},
{value: "PuRd", text: "Purple-Red"},
{value: "PuBu", text: "Purple-Blue"},
{value: "YlOrBr", text: "Yellow-Orange-Brown"},
{value: "YlOrRd", text: "Yellow-Orange-Red"},
{value: "YlGn", text: "Yellow-Green"},
{value: "YlGnBu", text: "Yellow-Green-Blue"},
{value: "Plasma", text: "Plasma"},
{value: "Magma", text: "Magma"},
{value: "Inferno", text: "Inferno"},
{value: "Viridis", text: "Viridis"},
{value: "Turbo", text: "Turbo"},
],
themeOptions: [
{value: "light", text: this.$t("theme.light")},
{value: "black", text: this.$t("theme.black")}
]
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: false,
configLoading: false,
langOptions: [
{value: "en", text: this.$t("lang.en")},
{value: "fr", text: this.$t("lang.fr")},
{value: "zh-CN", text: this.$t("lang.zh-CN")},
{value: "de", text: this.$t("lang.de")},
],
queryModeOptions: [
{value: "simple", text: this.$t("queryMode.simple")},
{value: "advanced", text: this.$t("queryMode.advanced")}
],
displayModeOptions: [
{value: "grid", text: this.$t("displayMode.grid")},
{value: "list", text: this.$t("displayMode.list")}
],
columnsOptions: [
{value: "auto", text: this.$t("columns.auto")},
{value: 1, text: "1"},
{value: 2, text: "2"},
{value: 3, text: "3"},
{value: 4, text: "4"},
{value: 5, text: "5"},
{value: 6, text: "6"},
{value: 7, text: "7"},
{value: 8, text: "8"},
{value: 9, text: "9"},
{value: 10, text: "10"},
{value: 11, text: "11"},
{value: 12, text: "12"},
],
treemapTypeOptions: [
{value: "cascaded", text: this.$t("treemapType.cascaded")},
{value: "flat", text: this.$t("treemapType.flat")}
],
treemapTilingOptions: [
{value: "binary", text: this.$t("treemapTiling.binary")},
{value: "squarify", text: this.$t("treemapTiling.squarify")},
{value: "slice", text: this.$t("treemapTiling.slice")},
{value: "dice", text: this.$t("treemapTiling.dice")},
{value: "sliceDice", text: this.$t("treemapTiling.sliceDice")},
],
treemapSizeOptions: [
{value: "small", text: this.$t("treemapSize.small")},
{value: "medium", text: this.$t("treemapSize.medium")},
{value: "large", text: this.$t("treemapSize.large")},
{value: "x-large", text: this.$t("treemapSize.xLarge")},
{value: "xx-large", text: this.$t("treemapSize.xxLarge")},
// {value: "custom", text: this.$t("treemapSize.custom")},
],
treemapColorOptions: [
{value: "PuBuGn", text: "Purple-Blue-Green"},
{value: "PuRd", text: "Purple-Red"},
{value: "PuBu", text: "Purple-Blue"},
{value: "YlOrBr", text: "Yellow-Orange-Brown"},
{value: "YlOrRd", text: "Yellow-Orange-Red"},
{value: "YlGn", text: "Yellow-Green"},
{value: "YlGnBu", text: "Yellow-Green-Blue"},
{value: "Plasma", text: "Plasma"},
{value: "Magma", text: "Magma"},
{value: "Inferno", text: "Inferno"},
{value: "Viridis", text: "Viridis"},
{value: "Turbo", text: "Turbo"},
],
themeOptions: [
{value: "light", text: this.$t("theme.light")},
{value: "black", text: this.$t("theme.black")}
]
}
},
computed: {
...mapGetters([
"optTheme",
"optDisplay",
"optColumns",
"optHighlight",
"optFuzzy",
"optSearchInPath",
"optSuggestPath",
"optFragmentSize",
"optQueryMode",
"optTreemapType",
"optTreemapTiling",
"optTreemapColorGroupingDepth",
"optTreemapColor",
"optTreemapSize",
"optLightboxLoadOnlyCurrent",
"optLightboxSlideDuration",
"optResultSize",
"optTagOrOperator",
"optLang",
"optHideDuplicates",
"optHideLegacy",
"optUpdateMimeMap",
"optUseDatePicker",
"optVidPreviewInterval",
"optSimpleLightbox",
"optShowTagPickerFilter",
]),
clientWidth() {
return window.innerWidth;
}
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type.startsWith("setOpt")) {
this.$store.dispatch("updateConfiguration");
}
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
"setOptColumns",
"setOptHighlight",
"setOptFuzzy",
"setOptSearchInPath",
"setOptSuggestPath",
"setOptFragmentSize",
"setOptQueryMode",
"setOptTreemapType",
"setOptTreemapTiling",
"setOptTreemapColorGroupingDepth",
"setOptTreemapColor",
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",
"setOptHideDuplicates",
"setOptHideLegacy",
"setOptUpdateMimeMap",
"setOptUseDatePicker",
"setOptVidPreviewInterval",
"setOptSimpleLightbox",
"setOptShowTagPickerFilter",
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");
window.location.reload();
}
},
}
},
computed: {
...mapGetters([
"optTheme",
"optDisplay",
"optColumns",
"optHighlight",
"optFuzzy",
"optSearchInPath",
"optSuggestPath",
"optFragmentSize",
"optQueryMode",
"optTreemapType",
"optTreemapTiling",
"optTreemapColorGroupingDepth",
"optTreemapColor",
"optTreemapSize",
"optLightboxLoadOnlyCurrent",
"optLightboxSlideDuration",
"optResultSize",
"optTagOrOperator",
"optLang",
"optHideDuplicates",
"optHideLegacy",
"optUpdateMimeMap",
"optUseDatePicker",
"optVidPreviewInterval",
"optSimpleLightbox",
"optShowTagPickerFilter",
"optFeaturedFields",
"optMlRepositories",
"optAutoAnalyze",
]),
clientWidth() {
return window.innerWidth;
}
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type.startsWith("setOpt")) {
this.$store.dispatch("updateConfiguration");
}
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
"setOptColumns",
"setOptHighlight",
"setOptFuzzy",
"setOptSearchInPath",
"setOptSuggestPath",
"setOptFragmentSize",
"setOptQueryMode",
"setOptTreemapType",
"setOptTreemapTiling",
"setOptTreemapColorGroupingDepth",
"setOptTreemapColor",
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",
"setOptHideDuplicates",
"setOptHideLegacy",
"setOptUpdateMimeMap",
"setOptUseDatePicker",
"setOptVidPreviewInterval",
"setOptSimpleLightbox",
"setOptShowTagPickerFilter",
"setOptFeaturedFields",
"setOptMlRepositories",
"setOptAutoAnalyze",
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");
window.location.reload();
}
},
}
</script>
<style>
.shrink {
flex-grow: inherit;
}
</style>
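The optFeaturedFields value edited in this page is a JavaScript template string evaluated against each search hit, with the doc fields and helpers listed in the collapse panel in scope. A minimal sketch of such an evaluation (the renderer below is hypothetical, not sist2's actual implementation):
// Hypothetical renderer: evaluates a featured-fields template for one document.
function renderFeaturedFields(template, doc, humanDate, humanFileSize) {
return new Function("doc", "humanDate", "humanFileSize",
"return `" + template + "`;")(doc, humanDate, humanFileSize);
}
// renderFeaturedFields("<b>${humanDate(doc.mtime)}</b> ${doc.videoc || ''}", hit._source, humanDate, humanFileSize)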

View File

@@ -1,57 +1,61 @@
<template>
<div class="container">
<Lightbox></Lightbox>
<HelpDialog :show="showHelp" @close="showHelp = false"></HelpDialog>
<div class="container">
<Lightbox></Lightbox>
<HelpDialog :show="showHelp" @close="showHelp = false"></HelpDialog>
<b-card v-if="uiLoading">
<Preloader></Preloader>
</b-card>
<b-card v-if="uiLoading">
<Preloader></Preloader>
</b-card>
<b-card v-show="!uiLoading" id="search-panel">
<SearchBar @show-help="showHelp=true"></SearchBar>
<b-row>
<b-col style="height: 70px;" sm="6">
<SizeSlider></SizeSlider>
</b-col>
<b-col>
<PathTree @search="search(true)"></PathTree>
</b-col>
</b-row>
<b-row>
<b-col sm="6">
<DateSlider></DateSlider>
<b-row>
<b-col>
<IndexPicker></IndexPicker>
</b-col>
</b-row>
</b-col>
<b-col>
<b-tabs justified>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
<b-tab :title="$t('tags')">
<TagPicker :show-search-bar="$store.state.optShowTagPickerFilter"></TagPicker>
</b-tab>
</b-tabs>
</b-col>
</b-row>
</b-card>
<b-alert v-show="!uiLoading && showEsConnectionError" show variant="danger" class="mt-2">
{{ $t("toast.esConnErr") }}
</b-alert>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<b-card v-show="!uiLoading && !showEsConnectionError" id="search-panel">
<SearchBar @show-help="showHelp=true"></SearchBar>
<b-row>
<b-col style="height: 70px;" sm="6">
<SizeSlider></SizeSlider>
</b-col>
<b-col>
<PathTree @search="search(true)"></PathTree>
</b-col>
</b-row>
<b-row>
<b-col sm="6">
<DateSlider></DateSlider>
<b-row>
<b-col>
<IndexPicker></IndexPicker>
</b-col>
</b-row>
</b-col>
<b-col>
<b-tabs justified>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
<b-tab :title="$t('tags')">
<TagPicker :show-search-bar="$store.state.optShowTagPickerFilter"></TagPicker>
</b-tab>
</b-tabs>
</b-col>
</b-row>
</b-card>
<ResultsCard></ResultsCard>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<ResultsCard></ResultsCard>
</div>
<div v-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
<DocList v-else :docs="docs" :append="appendFunc"></DocList>
</div>
</div>
<div v-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
<DocList v-else :docs="docs" :append="appendFunc"></DocList>
</div>
</div>
</template>
<script lang="ts">
@@ -78,234 +82,253 @@ import HelpDialog from "@/components/HelpDialog.vue";
export default Vue.extend({
components: {
HelpDialog,
DocList,
TagPicker,
DateSlider,
SizeSlider, PathTree, ResultsCard, MimePicker, Lightbox, DocCardWall, IndexPicker, SearchBar, Preloader
},
data: () => ({
loading: false,
uiLoading: true,
search: undefined as any,
docs: [] as EsHit[],
docIds: new Set(),
docChecksums: new Set(),
searchBusy: false,
Sist2Query: Sist2Query,
showHelp: false
}),
computed: {
...mapGetters(["indices", "optDisplay"]),
},
mounted() {
// Handle touch events
window.ontouchend = () => this.$store.commit("busTouchEnd");
window.ontouchcancel = () => this.$store.commit("busTouchEnd");
this.search = _debounce(async (clear: boolean) => {
if (clear) {
await this.clearResults();
}
await this.searchNow(Sist2Query.searchQuery());
}, 350, {leading: false});
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
if ([
"setSizeMin", "setSizeMax", "setDateMin", "setDateMax", "setSearchText", "setPathText",
"setSortMode", "setOptHighlight", "setOptFragmentSize", "setFuzzy", "setSize", "setSelectedIndices",
"setSelectedMimeTypes", "setSelectedTags", "setOptQueryMode", "setOptSearchInPath",
].includes(mutation.type)) {
if (this.searchBusy) {
return;
}
this.search(true);
}
});
});
this.setIndices(this.$store.getters["sist2Info"].indices)
this.getDateRange().then((range: { min: number, max: number }) => {
this.setDateBoundsMin(range.min);
this.setDateBoundsMax(range.max);
const doBlankSearch = !this.$store.state.optUpdateMimeMap;
Sist2Api.getMimeTypes(Sist2Query.searchQuery(doBlankSearch)).then(({mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
components: {
HelpDialog,
DocList,
TagPicker,
DateSlider,
SizeSlider, PathTree, ResultsCard, MimePicker, Lightbox, DocCardWall, IndexPicker, SearchBar, Preloader
},
data: () => ({
loading: false,
uiLoading: true,
search: undefined as any,
docs: [] as EsHit[],
docIds: new Set(),
docChecksums: new Set(),
searchBusy: false,
Sist2Query: Sist2Query,
showHelp: false,
showEsConnectionError: false
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",
setTags: "setTags",
}),
showErrorToast() {
this.$bvToast.toast(
this.$t("toast.esConnErr"),
{
title: this.$t("toast.esConnErrTitle"),
noAutoHide: true,
toaster: "b-toaster-bottom-right",
headerClass: "toast-header-error",
bodyClass: "toast-body-error",
});
computed: {
...mapGetters(["indices", "optDisplay"]),
},
showSyntaxErrorToast: function (): void {
this.$bvToast.toast(
this.$t("toast.esQueryErr"),
{
title: this.$t("toast.esQueryErrTitle"),
noAutoHide: true,
toaster: "b-toaster-bottom-right",
headerClass: "toast-header-warning",
bodyClass: "toast-body-warning",
});
},
async searchNow(q: any) {
this.searchBusy = true;
await this.$store.dispatch("incrementQuerySequence");
this.$store.commit("busSearch");
mounted() {
// Handle touch events
window.ontouchend = () => this.$store.commit("busTouchEnd");
window.ontouchcancel = () => this.$store.commit("busTouchEnd");
Sist2Api.esQuery(q).then(async (resp: EsResult) => {
await this.handleSearch(resp);
this.searchBusy = false;
}).catch(err => {
if (err.response.status === 500 && this.$store.state.optQueryMode === "advanced") {
this.showSyntaxErrorToast();
} else {
this.showErrorToast();
}
});
},
async clearResults() {
this.docs = [];
this.docIds.clear();
this.docChecksums.clear();
await this.$store.dispatch("clearResults");
this.$store.commit("setUiReachedScrollEnd", false);
},
async handleSearch(resp: EsResult) {
if (resp.hits.hits.length == 0 || resp.hits.hits.length < this.$store.state.optSize) {
this.$store.commit("setUiReachedScrollEnd", true);
}
this.search = _debounce(async (clear: boolean) => {
if (clear) {
await this.clearResults();
}
resp.hits.hits = resp.hits.hits.filter(hit => !this.docIds.has(hit._id));
await this.searchNow(Sist2Query.searchQuery());
if (this.$store.state.optHideDuplicates) {
resp.hits.hits = resp.hits.hits.filter(hit => {
}, 350, {leading: false});
if (!("checksum" in hit._source)) {
return true;
}
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
if ([
"setSizeMin", "setSizeMax", "setDateMin", "setDateMax", "setSearchText", "setPathText",
"setSortMode", "setOptHighlight", "setOptFragmentSize", "setFuzzy", "setSize", "setSelectedIndices",
"setSelectedMimeTypes", "setSelectedTags", "setOptQueryMode", "setOptSearchInPath",
].includes(mutation.type)) {
if (this.searchBusy) {
return;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
this.docChecksums.add(hit._source.checksum);
return isDupe;
this.search(true);
}
});
});
}
for (const hit of resp.hits.hits) {
if (hit._props.isPlayableImage || hit._props.isPlayableVideo) {
hit._seq = await this.$store.dispatch("getKeySequence");
this.$store.commit("addLightboxSource", {
source: `f/${hit._id}`,
thumbnail: hit._props.hasThumbnail
? `t/${hit._source.index}/${hit._id}`
: null,
caption: {
component: LightboxCaption,
props: {hit: hit}
},
type: hit._props.isVideo ? "video" : "image"
});
}
}
this.setIndices(this.$store.getters["sist2Info"].indices)
await this.$store.dispatch("remountLightbox");
this.$store.commit("setLastQueryResult", resp);
this.getDateRange().then((range: { min: number, max: number }) => {
this.setDateBoundsMin(range.min);
this.setDateBoundsMax(range.max);
this.docs.push(...resp.hits.hits);
const doBlankSearch = !this.$store.state.optUpdateMimeMap;
resp.hits.hits.forEach(hit => this.docIds.add(hit._id));
Sist2Api.getMimeTypes(Sist2Query.searchQuery(doBlankSearch)).then(({mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
}).catch(error => {
console.log(error);
if (error.response.status == 503 || error.response.status == 500) {
this.showEsConnectionError = true;
this.uiLoading = false;
} else {
this.showErrorToast();
}
});
},
getDateRange(): Promise<{ min: number, max: number }> {
return sist2.esQuery({
// TODO: filter current selected indices
aggs: {
dateMin: {min: {field: "mtime"}},
dateMax: {max: {field: "mtime"}},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",
setTags: "setTags",
}),
showErrorToast() {
this.$bvToast.toast(
this.$t("toast.esConnErr"),
{
title: this.$t("toast.esConnErrTitle"),
noAutoHide: true,
toaster: "b-toaster-bottom-right",
headerClass: "toast-header-error",
bodyClass: "toast-body-error",
});
},
size: 0
}).then(res => {
return {
min: res.aggregations.dateMin.value,
max: res.aggregations.dateMax.value,
showSyntaxErrorToast: function (): void {
this.$bvToast.toast(
this.$t("toast.esQueryErr"),
{
title: this.$t("toast.esQueryErrTitle"),
noAutoHide: true,
toaster: "b-toaster-bottom-right",
headerClass: "toast-header-warning",
bodyClass: "toast-body-warning",
});
},
async searchNow(q: any) {
this.searchBusy = true;
await this.$store.dispatch("incrementQuerySequence");
this.$store.commit("busSearch");
Sist2Api.esQuery(q).then(async (resp: EsResult) => {
await this.handleSearch(resp);
this.searchBusy = false;
}).catch(err => {
if (err.response.status === 500 && this.$store.state.optQueryMode === "advanced") {
this.showSyntaxErrorToast();
} else {
this.showErrorToast();
}
});
},
async clearResults() {
this.docs = [];
this.docIds.clear();
this.docChecksums.clear();
await this.$store.dispatch("clearResults");
this.$store.commit("setUiReachedScrollEnd", false);
},
async handleSearch(resp: EsResult) {
if (resp.hits.hits.length == 0 || resp.hits.hits.length < this.$store.state.optSize) {
this.$store.commit("setUiReachedScrollEnd", true);
}
resp.hits.hits = resp.hits.hits.filter(hit => !this.docIds.has(hit._id));
if (this.$store.state.optHideDuplicates) {
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
this.docChecksums.add(hit._source.checksum);
return isDupe;
});
}
for (const hit of resp.hits.hits) {
if (hit._props.isPlayableImage || hit._props.isPlayableVideo) {
hit._seq = await this.$store.dispatch("getKeySequence");
this.$store.commit("addLightboxSource", {
source: `f/${hit._id}`,
thumbnail: hit._props.hasThumbnail
? `t/${hit._source.index}/${hit._id}`
: null,
caption: {
component: LightboxCaption,
props: {hit: hit}
},
type: hit._props.isVideo ? "video" : "image"
});
}
}
await this.$store.dispatch("remountLightbox");
this.$store.commit("setLastQueryResult", resp);
this.docs.push(...resp.hits.hits);
resp.hits.hits.forEach(hit => this.docIds.add(hit._id));
},
getDateRange(): Promise<{ min: number, max: number }> {
return sist2.esQuery({
// TODO: filter current selected indices
aggs: {
dateMin: {min: {field: "mtime"}},
dateMax: {max: {field: "mtime"}},
},
size: 0
}).then(res => {
const range = {
min: res.aggregations.dateMin.value,
max: res.aggregations.dateMax.value,
}
if (range.min == null) {
range.min = 0;
range.max = 1;
} else if (range.min == range.max) {
range.max += 1;
}
return range;
});
},
appendFunc() {
if (!this.$store.state.uiReachedScrollEnd && this.search && !this.searchBusy) {
this.searchNow(Sist2Query.searchQuery());
}
}
},
beforeRouteUpdate(to, from, next) {
if (this.$store.state.uiLightboxIsOpen) {
this.$store.commit("_setUiShowLightbox", false);
next(false);
} else {
next();
}
})
},
appendFunc() {
if (!this.$store.state.uiReachedScrollEnd && this.search && !this.searchBusy) {
this.searchNow(Sist2Query.searchQuery());
}
}
},
beforeRouteUpdate(to, from, next) {
if (this.$store.state.uiLightboxIsOpen) {
this.$store.commit("_setUiShowLightbox", false);
next(false);
} else {
next();
}
},
})
</script>
<style>
#search-panel {
box-shadow: 0 .125rem .25rem rgba(0, 0, 0, .08) !important;
border-radius: 0;
border: none;
}
.toast-header-info, .toast-body-info {
background: #2196f3;
color: #fff !important;
}
.toast-header-error, .toast-body-error {
background: #a94442;
color: #f2dede !important;
}
.toast-header-error {
color: #fff !important;
border-bottom: none;
margin-bottom: -1em;
}
.toast-header-error .close {
text-shadow: none;
}
.toast-header-warning, .toast-body-warning {
background: #FF8F00;
color: #FFF3E0 !important;
}
</style>
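The checksum-based duplicate filter inside handleSearch above, extracted as a standalone sketch for clarity:
// Keeps the first hit seen per checksum; hits without a checksum always pass.
function dropDuplicateHits(hits, seenChecksums) {
return hits.filter((hit) => {
if (!("checksum" in hit._source)) {
return true;
}
const isNew = !seenChecksums.has(hit._source.checksum);
seenChecksums.add(hit._source.checksum);
return isNew;
});
}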

View File

@@ -1,12 +1,13 @@
#ifndef SIST2_AUTH0_C_API_H
#define SIST2_AUTH0_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#include "cstdlib"
#else
#define EXTERNC
#include "stdlib.h"
#endif
#define AUTH0_OK (0)

src/cli.c (177 changed lines)
View File

@@ -2,16 +2,17 @@
#include "ctx.h"
#include <tesseract/capi.h>
#define DEFAULT_OUTPUT "index.sist2/"
#define DEFAULT_OUTPUT "index.sist2"
#define DEFAULT_NAME "index"
#define DEFAULT_CONTENT_SIZE 32768
#define DEFAULT_QUALITY 1
#define DEFAULT_THUMBNAIL_SIZE 500
#define DEFAULT_QUALITY 2
#define DEFAULT_THUMBNAIL_SIZE 552
#define DEFAULT_THUMBNAIL_COUNT 1
#define DEFAULT_REWRITE_URL ""
#define DEFAULT_ES_URL "http://localhost:9200"
#define DEFAULT_ES_INDEX "sist2"
#define DEFAULT_BATCH_SIZE 100
#define DEFAULT_BATCH_SIZE 70
#define DEFAULT_TAGLINE "Lightning-fast file system indexer and search tool"
#define DEFAULT_LANG "en"
@@ -20,8 +21,6 @@
#define DEFAULT_MAX_MEM_BUFFER 2000
#define DEFAULT_THROTTLE_MEMORY_THRESHOLD 0
const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
@@ -48,9 +47,6 @@ void scan_args_destroy(scan_args_t *args) {
if (args->name != NULL) {
free(args->name);
}
if (args->incremental != NULL) {
free(args->incremental);
}
if (args->path != NULL) {
free(args->path);
}
@@ -61,7 +57,6 @@ void scan_args_destroy(scan_args_t *args) {
}
void index_args_destroy(index_args_t *args) {
//todo
if (args->es_mappings_path) {
free(args->es_mappings);
}
@@ -76,7 +71,6 @@ void index_args_destroy(index_args_t *args) {
}
void web_args_destroy(web_args_t *args) {
//todo
free(args);
}
@@ -97,23 +91,17 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
char *abs_path = abspath(argv[1]);
if (abs_path == NULL) {
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
} else {
abs_path = realloc(abs_path, strlen(abs_path) + 2);
strcat(abs_path, "/");
args->path = abs_path;
}
if (args->incremental != OPTION_VALUE_UNSPECIFIED) {
args->incremental = abspath(args->incremental);
if (abs_path == NULL) {
sist_log("main.c", LOG_SIST_WARNING, "Could not open original index! Disabled incremental scan feature.");
args->incremental = NULL;
}
}
if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) {
args->tn_quality = DEFAULT_QUALITY;
} else if (args->tn_quality < 1.0f || args->tn_quality > 31.0f) {
fprintf(stderr, "Invalid value for --thumbnail-quality argument: %f. Must be within [1.0, 31.0].\n",
} else if (args->tn_quality < 2 || args->tn_quality > 31) {
fprintf(stderr, "Invalid value for --thumbnail-quality argument: %d. Must be within [2, 31].\n",
args->tn_quality);
return 1;
}
@@ -140,8 +128,8 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->threads == 0) {
args->threads = 1;
} else if (args->threads < 0) {
fprintf(stderr, "Invalid value for --threads: %d. Must be a positive number\n", args->threads);
} else if (args->threads < 0 || args->threads > 256) {
fprintf(stderr, "Invalid value for --threads: %d. Must be a positive number <= 256\n", args->threads);
return 1;
}
@@ -152,20 +140,24 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->output = expandpath(args->output);
}
int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR);
if (ret != 0) {
fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno));
return 1;
char *abs_output = abspath(args->output);
if (args->incremental && abs_output == NULL) {
LOG_WARNINGF("main.c", "Could not open original index for incremental scan: %s. Will not perform incremental scan.", args->output);
args->incremental = FALSE;
} else if (!args->incremental && abs_output != NULL) {
LOG_FATALF("main.c", "Index already exists: %s. If you wish to perform incremental scan, you must specify --incremental", abs_output);
}
free(abs_output);
if (args->depth <= 0) {
args->depth = G_MAXINT32;
args->depth = 2147483647;
} else {
args->depth += 1;
}
if (args->name == OPTION_VALUE_UNSPECIFIED) {
args->name = g_path_get_basename(args->output);
args->name = malloc(strlen(DEFAULT_NAME) + 1);
strcpy(args->name, DEFAULT_NAME);
} else {
char *tmp = malloc(strlen(args->name) + 1);
strcpy(tmp, args->name);
@@ -224,7 +216,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
}
if (trained_data_path != NULL && path != trained_data_path) {
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
"files must be in the same folder")
"files must be in the same folder");
}
trained_data_path = path;
@@ -232,7 +224,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
int ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
@@ -249,12 +241,12 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset);
}
pcre_extra *re_extra = pcre_study(re, 0, &error);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
LOG_FATALF("cli.c", "pcre_study returned error: %s", error);
}
ScanCtx.exclude = re;
@@ -273,14 +265,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer_mib = DEFAULT_MAX_MEM_BUFFER;
}
if (args->scan_mem_limit_mib == OPTION_VALUE_UNSPECIFIED || args->scan_mem_limit_mib == OPTION_VALUE_DISABLE) {
args->scan_mem_limit_mib = DEFAULT_THROTTLE_MEMORY_THRESHOLD;
}
if (args->list_path != OPTION_VALUE_UNSPECIFIED) {
if (strcmp(args->list_path, "-") == 0) {
args->list_file = stdin;
LOG_DEBUG("cli.c", "Using stdin as list file")
LOG_DEBUG("cli.c", "Using stdin as list file");
} else {
args->list_file = fopen(args->list_path, "r");
@@ -290,27 +278,27 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
}
}
LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality)
LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size)
LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
LOG_DEBUGF("cli.c", "arg threads=%d", args->threads)
LOG_DEBUGF("cli.c", "arg incremental=%s", args->incremental)
LOG_DEBUGF("cli.c", "arg output=%s", args->output)
LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url)
LOG_DEBUGF("cli.c", "arg name=%s", args->name)
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
LOG_DEBUGF("cli.c", "arg path=%s", args->path)
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase)
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib)
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality);
LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size);
LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count);
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size);
LOG_DEBUGF("cli.c", "arg threads=%d", args->threads);
LOG_DEBUGF("cli.c", "arg incremental=%d", args->incremental);
LOG_DEBUGF("cli.c", "arg output=%s", args->output);
LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url);
LOG_DEBUGF("cli.c", "arg name=%s", args->name);
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth);
LOG_DEBUGF("cli.c", "arg path=%s", args->path);
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive);
LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase);
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang);
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path);
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex);
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast);
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub);
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold);
LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib);
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path);
return 0;
}
@@ -320,20 +308,20 @@ int load_external_file(const char *file_path, char **dst) {
int res = stat(file_path, &info);
if (res == -1) {
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
return 1;
}
int fd = open(file_path, O_RDONLY);
if (fd == -1) {
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
return 1;
}
*dst = malloc(info.st_size + 1);
res = read(fd, *dst, info.st_size);
if (res < 0) {
LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno))
LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno));
return 1;
}
@@ -361,7 +349,7 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
char *index_path = abspath(argv[1]);
if (index_path == NULL) {
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
} else {
args->index_path = index_path;
}
@@ -396,28 +384,28 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
args->batch_size = DEFAULT_BATCH_SIZE;
}
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path)
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script)
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path);
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);
LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script);
if (args->script) {
char log_buf[5000];
strncpy(log_buf, args->script, sizeof(log_buf));
*(log_buf + sizeof(log_buf) - 1) = '\0';
LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
LOG_DEBUGF("cli.c", "arg script=%s", log_buf);
}
LOG_DEBUGF("cli.c", "arg print=%d", args->print)
LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path)
LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings)
LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path)
LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings)
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size)
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset)
LOG_DEBUGF("cli.c", "arg print=%d", args->print);
LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path);
LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings);
LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path);
LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings);
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size);
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset);
return 0;
}
@@ -538,23 +526,24 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
if (abs_path == NULL) {
LOG_FATALF("cli.c", "Index not found: %s", args->indices[i])
LOG_FATALF("cli.c", "Index not found: %s", args->indices[i]);
}
free(abs_path);
}
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline)
LOG_DEBUGF("cli.c", "arg dev=%d", args->dev)
LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address)
LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials)
LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials)
LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user)
LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass)
LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count)
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline);
LOG_DEBUGF("cli.c", "arg dev=%d", args->dev);
LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address);
LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials);
LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials);
LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user);
LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass);
LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count);
for (int i = 0; i < args->index_count; i++) {
LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i])
LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i]);
}
return 0;
@@ -579,7 +568,7 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
char *index_path = abspath(argv[1]);
if (index_path == NULL) {
LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1])
LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1]);
} else {
args->index_path = index_path;
}
@@ -600,12 +589,12 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
return 1;
}
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);
char log_buf[5000];
strncpy(log_buf, args->script, sizeof(log_buf));
*(log_buf + sizeof(log_buf) - 1) = '\0';
LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
LOG_DEBUGF("cli.c", "arg script=%s", log_buf);
return 0;
}


@@ -9,12 +9,12 @@
#define OPTION_VALUE_UNSPECIFIED (0)
typedef struct scan_args {
float tn_quality;
int tn_quality;
int tn_size;
int content_size;
int threads;
int scan_mem_limit_mib;
char *incremental;
int incremental;
int optimize_database;
char *output;
char *rewrite_url;
char *name;


@@ -3,9 +3,10 @@
ScanCtx_t ScanCtx = {
.stat_index_size = 0,
.stat_tn_size = 0,
.dbg_current_files = NULL,
.pool = NULL
.pool = NULL,
.index.path = {0,},
};
WebCtx_t WebCtx;
IndexCtx_t IndexCtx;
LogCtx_t LogCtx;
__thread ProcData_t ProcData;


@@ -16,47 +16,28 @@
#include "libscan/msdoc/msdoc.h"
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include "src/database/database.h"
#include "src/index/elastic.h"
#include "sqlite3.h"
#include <glib.h>
#include <pcre.h>
typedef struct {
struct index_t index;
GHashTable *mime_table;
GHashTable *ext_table;
tpool_t *pool;
tpool_t *writer_pool;
int threads;
int depth;
int calculate_checksums;
size_t mem_limit;
size_t stat_tn_size;
size_t stat_index_size;
GHashTable *original_table;
GHashTable *copy_table;
GHashTable *new_table;
pthread_mutex_t copy_table_mu;
pcre *exclude;
pcre_extra *exclude_extra;
int fast;
GHashTable *dbg_current_files;
pthread_mutex_t dbg_current_files_mu;
int dbg_failed_files_count;
int dbg_skipped_files_count;
int dbg_excluded_files_count;
pthread_mutex_t dbg_file_counts_mu;
scan_arc_ctx_t arc_ctx;
scan_comic_ctx_t comic_ctx;
scan_ebook_ctx_t ebook_ctx;
@@ -85,10 +66,6 @@ typedef struct {
char *es_index;
int batch_size;
tpool_t *pool;
store_t *tag_store;
GHashTable *tags;
store_t *meta_store;
GHashTable *meta;
/**
* Set to false when using --print
*/
@@ -118,10 +95,18 @@ typedef struct {
int dev;
} WebCtx_t;
typedef struct {
int thread_id;
database_t *ipc_db;
database_t *index_db;
} ProcData_t;
extern ScanCtx_t ScanCtx;
extern WebCtx_t WebCtx;
extern IndexCtx_t IndexCtx;
extern LogCtx_t LogCtx;
extern __thread ProcData_t ProcData;
#endif

src/database/database.c (new file, 644 lines)

@@ -0,0 +1,644 @@
#include "database.h"
#include "malloc.h"
#include "src/ctx.h"
#include <string.h>
#include <pthread.h>
#include "src/util.h"
#include <time.h>
database_t *database_create(const char *filename, database_type_t type) {
database_t *db = malloc(sizeof(database_t));
strcpy(db->filename, filename);
db->type = type;
db->select_thumbnail_stmt = NULL;
db->ipc_ctx = NULL;
return db;
}
__always_inline
static int sep_rfind(const char *str) {
for (int i = (int) strlen(str); i >= 0; i--) {
if (str[i] == '/') {
return i;
}
}
return -1;
}
void path_parent_func(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
sqlite3_result_error(ctx, "Invalid parameters", -1);
return;
}
const char *value = (const char *) sqlite3_value_text(argv[0]);
int stop = sep_rfind(value);
if (stop == -1) {
sqlite3_result_null(ctx);
return;
}
char parent[PATH_MAX * 3];
strncpy(parent, value, stop);
sqlite3_result_text(ctx, parent, stop, SQLITE_TRANSIENT);
}
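
For illustration, a minimal sketch of how this UDF behaves once database_open() (below) registers it on a connection; the helper and callback names are hypothetical:

#include <stdio.h>
#include <sqlite3.h>

// path_parent('/data/photos/2023') -> '/data/photos'
// path_parent('file.txt')          -> NULL (no '/' separator)
static int print_parent_cb(void *unused, int argc, char **argv, char **cols) {
    (void) unused; (void) argc; (void) cols;
    printf("%s -> %s\n", argv[0], argv[1] ? argv[1] : "NULL");
    return 0;
}

void demo_path_parent(sqlite3 *conn) {
    sqlite3_exec(conn, "SELECT '/data/photos/2023', path_parent('/data/photos/2023');",
                 print_parent_cb, NULL, NULL);
}
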
void save_current_job_info(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
sqlite3_result_error(ctx, "Invalid parameters", -1);
return;
}
database_ipc_ctx_t *ipc_ctx = sqlite3_user_data(ctx);
const char *current_job = (const char *) sqlite3_value_text(argv[0]);
strcpy(ipc_ctx->current_job[ProcData.thread_id], current_job);
sqlite3_result_text(ctx, "ok", -1, SQLITE_STATIC);
}
void database_initialize(database_t *db) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
LOG_DEBUGF("database.c", "Initializing database %s", db->filename);
if (db->type == INDEX_DATABASE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IndexDatabaseSchema, NULL, NULL, NULL));
} else if (db->type == IPC_CONSUMER_DATABASE || db->type == IPC_PRODUCER_DATABASE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IpcDatabaseSchema, NULL, NULL, NULL));
}
sqlite3_close(db->db);
}
void database_open(database_t *db) {
LOG_DEBUGF("database.c", "Opening database %s (%d)", db->filename, db->type);
CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
sqlite3_busy_timeout(db->db, 1000);
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA cache_size = -200000;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA synchronous = OFF;", NULL, NULL, NULL));
if (db->type == INDEX_DATABASE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA temp_store = memory;", NULL, NULL, NULL));
}
if (db->type == INDEX_DATABASE) {
// Prepare statements
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"SELECT data FROM thumbnail WHERE id=? AND num=? LIMIT 1;", -1,
&db->select_thumbnail_stmt, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"UPDATE document SET marked=1 WHERE id=? AND mtime=? RETURNING id",
-1,
&db->mark_document_stmt, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"REPLACE INTO document_sidecar (id, json_data) VALUES (?,?)", -1,
&db->write_document_sidecar_stmt, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"REPLACE INTO document (id, mtime, size, json_data) VALUES (?, ?, ?, ?);", -1,
&db->write_document_stmt, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"INSERT INTO thumbnail (id, num, data) VALUES (?,?,?) ON CONFLICT DO UPDATE SET data=excluded.data;",
-1,
&db->write_thumbnail_stmt, NULL));
// Create functions
sqlite3_create_function(
db->db,
"path_parent",
1,
SQLITE_UTF8,
NULL,
path_parent_func,
NULL,
NULL
);
} else if (db->type == IPC_CONSUMER_DATABASE) {
sqlite3_create_function(
db->db,
"save_current_job_info",
1,
SQLITE_UTF8,
db->ipc_ctx,
save_current_job_info,
NULL,
NULL
);
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"DELETE FROM parse_job WHERE id = (SELECT MIN(id) FROM parse_job)"
" RETURNING filepath,mtime,st_size,save_current_job_info(filepath);",
-1, &db->pop_parse_job_stmt, NULL
));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db,
"DELETE FROM index_job WHERE id = (SELECT MIN(id) FROM index_job)"
" RETURNING doc_id,type,line;",
-1, &db->pop_index_job_stmt, NULL
));
} else if (db->type == IPC_PRODUCER_DATABASE) {
char sql[40];
int max_size_mb = 10; // TODO: read from args.
snprintf(sql, sizeof(sql), "PRAGMA max_page_count=%d", (max_size_mb * 1024 * 1024) / 4096);
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, sql, NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "INSERT INTO parse_job (filepath,mtime,st_size) VALUES (?,?,?);", -1,
&db->insert_parse_job_stmt, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "INSERT INTO index_job (doc_id,type,line) VALUES (?,?,?);", -1,
&db->insert_index_job_stmt, NULL));
sqlite3_create_function(
db->db,
"path_parent",
1,
SQLITE_UTF8,
NULL,
path_parent_func,
NULL,
NULL
);
}
}
void database_close(database_t *db, int optimize) {
LOG_DEBUGF("database.c", "Closing database %s", db->filename);
if (optimize) {
LOG_DEBUG("database.c", "Optimizing database");
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "VACUUM;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA optimize;", NULL, NULL, NULL));
}
sqlite3_close(db->db);
if (db->type == IPC_PRODUCER_DATABASE) {
remove(db->filename);
}
free(db);
db = NULL;
}
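
Taken together, the functions above form the connection lifecycle. A minimal sketch, assuming a hypothetical index path:

#include "src/database/database.h"

// create -> initialize -> open -> ... -> close
void database_lifecycle_example(void) {
    database_t *db = database_create("/tmp/index.sist2", INDEX_DATABASE);
    database_initialize(db);  // creates the schema, then closes the raw handle
    database_open(db);        // reopens with pragmas, prepared statements and UDFs
    // ... write documents, sidecars and thumbnails here ...
    database_close(db, TRUE); // TRUE runs VACUUM + PRAGMA optimize; also frees db
}
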
void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len) {
sqlite3_bind_text(db->select_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
sqlite3_bind_int(db->select_thumbnail_stmt, 2, num);
int ret = sqlite3_step(db->select_thumbnail_stmt);
if (ret == SQLITE_DONE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->select_thumbnail_stmt));
*return_value_len = 0;
return NULL;
}
CRASH_IF_STMT_FAIL(ret);
const void *blob = sqlite3_column_blob(db->select_thumbnail_stmt, 0);
const int blob_size = sqlite3_column_bytes(db->select_thumbnail_stmt, 0);
*return_value_len = blob_size;
void *return_data = malloc(blob_size);
memcpy(return_data, blob, blob_size);
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->select_thumbnail_stmt));
return return_data;
}
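
A usage sketch for the thumbnail read path; the 32-character document id below is hypothetical:

// Fetch thumbnail #0 for a document; the caller owns the returned buffer.
void thumbnail_read_example(database_t *db) {
    size_t len;
    void *data = database_read_thumbnail(db, "0123456789abcdef0123456789abcdef", 0, &len);
    if (data == NULL) {
        return; // no thumbnail stored under this (id, num)
    }
    // ... serve `len` bytes, e.g. as an HTTP response body ...
    free(data);
}
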
void database_write_index_descriptor(database_t *db, index_descriptor_t *desc) {
sqlite3_exec(db->db, "DELETE FROM descriptor;", NULL, NULL, NULL);
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db, "INSERT INTO descriptor (id, version_major, version_minor, version_patch,"
" root, name, rewrite_url, timestamp) VALUES (?,?,?,?,?,?,?,?);", -1, &stmt, NULL);
sqlite3_bind_text(stmt, 1, desc->id, -1, SQLITE_STATIC);
sqlite3_bind_int(stmt, 2, desc->version_major);
sqlite3_bind_int(stmt, 3, desc->version_minor);
sqlite3_bind_int(stmt, 4, desc->version_patch);
sqlite3_bind_text(stmt, 5, desc->root, -1, SQLITE_STATIC);
sqlite3_bind_text(stmt, 6, desc->name, -1, SQLITE_STATIC);
sqlite3_bind_text(stmt, 7, desc->rewrite_url, -1, SQLITE_STATIC);
sqlite3_bind_int64(stmt, 8, desc->timestamp);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
}
index_descriptor_t *database_read_index_descriptor(database_t *db) {
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db, "SELECT id, version_major, version_minor, version_patch,"
" root, name, rewrite_url, timestamp FROM descriptor;", -1, &stmt, NULL);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
const char *id = (char *) sqlite3_column_text(stmt, 0);
int v_major = sqlite3_column_int(stmt, 1);
int v_minor = sqlite3_column_int(stmt, 2);
int v_patch = sqlite3_column_int(stmt, 3);
const char *root = (char *) sqlite3_column_text(stmt, 4);
const char *name = (char *) sqlite3_column_text(stmt, 5);
const char *rewrite_url = (char *) sqlite3_column_text(stmt, 6);
int timestamp = sqlite3_column_int(stmt, 7);
index_descriptor_t *desc = malloc(sizeof(index_descriptor_t));
strcpy(desc->id, id);
snprintf(desc->version, sizeof(desc->version), "%d.%d.%d", v_major, v_minor, v_patch);
desc->version_major = v_major;
desc->version_minor = v_minor;
desc->version_patch = v_patch;
strcpy(desc->root, root);
strcpy(desc->name, name);
strcpy(desc->rewrite_url, rewrite_url);
desc->timestamp = timestamp;
CRASH_IF_NOT_SQLITE_OK(sqlite3_finalize(stmt));
return desc;
}
database_iterator_t *database_create_delete_list_iterator(database_t *db) {
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db, "SELECT id FROM delete_list;", -1, &stmt, NULL);
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
iter->stmt = stmt;
iter->db = db;
return iter;
}
char *database_delete_list_iter(database_iterator_t *iter) {
int ret = sqlite3_step(iter->stmt);
if (ret == SQLITE_ROW) {
const char *id = (const char *) sqlite3_column_text(iter->stmt, 0);
char *id_heap = malloc(strlen(id) + 1);
strcpy(id_heap, id);
return id_heap;
}
if (ret != SQLITE_DONE) {
LOG_FATALF("database.c", "FIXME: delete iter returned %s", sqlite3_errmsg(iter->db->db));
}
if (sqlite3_finalize(iter->stmt) != SQLITE_OK) {
LOG_FATALF("database.c", "FIXME: delete iter returned %s", sqlite3_errmsg(iter->db->db));
}
iter->stmt = NULL;
return NULL;
}
database_iterator_t *database_create_document_iterator(database_t *db) {
sqlite3_stmt *stmt;
// TODO optimization: remove mtime, size, _id from json_data
sqlite3_prepare_v2(db->db, "WITH doc (j) AS (SELECT CASE"
" WHEN sc.json_data IS NULL THEN"
" CASE"
" WHEN t.tag IS NULL THEN"
" json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime)"
" ELSE"
" json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
" END"
" ELSE"
" CASE"
" WHEN t.tag IS NULL THEN"
" json_patch(json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime), sc.json_data)"
" ELSE"
// This will overwrite any tags specified in the sidecar file!
// TODO: concatenate the two arrays?
" json_set(json_patch(document.json_data, sc.json_data), '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
" END"
" END"
" FROM document"
" LEFT JOIN document_sidecar sc ON document.id = sc.id"
" LEFT JOIN tag t ON document.id = t.id"
" GROUP BY document.id)"
" SELECT json_set(j, '$.index', (SELECT id FROM descriptor)) FROM doc", -1, &stmt, NULL);
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
iter->stmt = stmt;
iter->db = db;
return iter;
}
cJSON *database_document_iter(database_iterator_t *iter) {
if (iter->stmt == NULL) {
LOG_ERROR("database.c", "FIXME: database_document_iter() called after iteration stopped");
return NULL;
}
int ret = sqlite3_step(iter->stmt);
if (ret == SQLITE_ROW) {
const char *json_string = (const char *) sqlite3_column_text(iter->stmt, 0);
return cJSON_Parse(json_string);
}
if (ret != SQLITE_DONE) {
LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
}
if (sqlite3_finalize(iter->stmt) != SQLITE_OK) {
LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
}
iter->stmt = NULL;
return NULL;
}
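
The iterator is meant to be driven through the database_document_iter_foreach macro declared in database.h (shown further down). A minimal consumer sketch, where puts() stands in for real work:

#include <stdio.h>
#include "src/database/database.h"

void dump_documents_example(database_t *db) {
    database_iterator_t *iter = database_create_document_iterator(db);
    database_document_iter_foreach(doc, iter) {
        char *str = cJSON_PrintUnformatted(doc);
        puts(str);
        cJSON_free(str);
        cJSON_Delete(doc); // returned rows are owned by the caller
    }
    free(iter); // the statement was already finalized when iteration returned NULL
}
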
void database_incremental_scan_begin(database_t *db) {
LOG_DEBUG("database.c", "Preparing database for incremental scan");
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "UPDATE document SET marked=0;", NULL, NULL, NULL));
}
void database_incremental_scan_end(database_t *db) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"DELETE FROM delete_list WHERE id IN (SELECT id FROM document WHERE marked=1);",
NULL, NULL, NULL
));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"DELETE FROM thumbnail WHERE id IN (SELECT id FROM document WHERE marked=0);",
NULL, NULL, NULL
));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"INSERT INTO delete_list (id) SELECT id FROM document WHERE marked=0;",
NULL, NULL, NULL
));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"DELETE FROM document_sidecar WHERE id IN (SELECT id FROM document WHERE marked=0);",
NULL, NULL, NULL
));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"DELETE FROM document WHERE marked=0;",
NULL, NULL, NULL
));
}
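
These two calls bracket a mark-and-sweep pass: _begin clears every mark, database_mark_document() (below) re-marks documents that are unchanged on disk, and _end sweeps whatever stayed unmarked into delete_list before purging it. A sketch with a hypothetical id and mtime:

void incremental_scan_example(database_t *db) {
    database_incremental_scan_begin(db); // UPDATE document SET marked=0
    // For each file found on disk (walk loop omitted):
    if (database_mark_document(db, "0123456789abcdef0123456789abcdef", 1681500000)) {
        // id+mtime still match: skip re-parsing this file
    } else {
        // new or modified: parse it and call database_write_document() again
    }
    database_incremental_scan_end(db); // unmarked rows -> delete_list, then purged
}
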
int database_mark_document(database_t *db, const char *id, int mtime) {
sqlite3_bind_text(db->mark_document_stmt, 1, id, -1, SQLITE_STATIC);
sqlite3_bind_int(db->mark_document_stmt, 2, mtime);
pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
int ret = sqlite3_step(db->mark_document_stmt);
if (ret == SQLITE_ROW) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
return TRUE;
}
if (ret == SQLITE_DONE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
return FALSE;
}
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
CRASH_IF_STMT_FAIL(ret);
return FALSE; // unreachable: CRASH_IF_STMT_FAIL() aborts unless ret is SQLITE_ROW or SQLITE_DONE
}
void database_write_document(database_t *db, document_t *doc, const char *json_data) {
sqlite3_bind_text(db->write_document_stmt, 1, doc->doc_id, -1, SQLITE_STATIC);
sqlite3_bind_int(db->write_document_stmt, 2, doc->mtime);
sqlite3_bind_int64(db->write_document_stmt, 3, (long) doc->size);
sqlite3_bind_text(db->write_document_stmt, 4, json_data, -1, SQLITE_STATIC);
pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_stmt));
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
void database_write_document_sidecar(database_t *db, const char *id, const char *json_data) {
sqlite3_bind_text(db->write_document_sidecar_stmt, 1, id, -1, SQLITE_STATIC);
sqlite3_bind_text(db->write_document_sidecar_stmt, 2, json_data, -1, SQLITE_STATIC);
pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_sidecar_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_sidecar_stmt));
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size) {
sqlite3_bind_text(db->write_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
sqlite3_bind_int(db->write_thumbnail_stmt, 2, num);
sqlite3_bind_blob(db->write_thumbnail_stmt, 3, data, (int) data_size, SQLITE_STATIC);
pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
CRASH_IF_STMT_FAIL(sqlite3_step(db->write_thumbnail_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_thumbnail_stmt));
pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
//void database_create_fts_index(database_t *db, database_t *fts_db) {
// // In a separate file,
//
// // use database_initialize() to create FTS schema
// // if --force-reset, then truncate the tables first
//
// /*
// * create/append fts table
// *
// * create/append scalar index table with
// * id,index,size,mtime,mime
// *
// * create/append path index table with
// * index,path,depth
// *
// * content table is a view with SELECT UNION for all attached tables
// * random_seed column
// */
//
// // INSERT INTO ft(ft) VALUES('optimize');
//}
job_t *database_get_work(database_t *db, job_type_t job_type) {
job_t *job;
pthread_mutex_lock(&db->ipc_ctx->mutex);
while (db->ipc_ctx->job_count == 0 && !db->ipc_ctx->no_more_jobs) {
pthread_cond_timedwait_ms(&db->ipc_ctx->has_work_cond, &db->ipc_ctx->mutex, 10);
}
pthread_mutex_unlock(&db->ipc_ctx->mutex);
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
if (job_type == JOB_PARSE_JOB) {
int ret = sqlite3_step(db->pop_parse_job_stmt);
if (ret == SQLITE_DONE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
return NULL;
} else {
CRASH_IF_STMT_FAIL(ret);
}
job = malloc(sizeof(*job));
job->parse_job = create_parse_job(
(const char *) sqlite3_column_text(db->pop_parse_job_stmt, 0),
sqlite3_column_int(db->pop_parse_job_stmt, 1),
sqlite3_column_int64(db->pop_parse_job_stmt, 2));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
} else {
int ret = sqlite3_step(db->pop_index_job_stmt);
if (ret == SQLITE_DONE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
return NULL;
}
CRASH_IF_STMT_FAIL(ret);
job = malloc(sizeof(*job));
const char *line = (const char *) sqlite3_column_text(db->pop_index_job_stmt, 2);
if (line != NULL) {
job->bulk_line = malloc(sizeof(es_bulk_line_t) + strlen(line) + 1);
strcpy(job->bulk_line->line, line);
} else {
job->bulk_line = malloc(sizeof(es_bulk_line_t));
}
strcpy(job->bulk_line->doc_id, (const char *) sqlite3_column_text(db->pop_index_job_stmt, 0));
job->bulk_line->type = sqlite3_column_int(db->pop_index_job_stmt, 1);
job->bulk_line->next = NULL;
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
}
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
pthread_mutex_lock(&db->ipc_ctx->mutex);
db->ipc_ctx->job_count -= 1;
pthread_mutex_unlock(&db->ipc_ctx->mutex);
job->type = job_type;
return job;
}
void database_add_work(database_t *db, job_t *job) {
int ret;
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
if (job->type == JOB_PARSE_JOB) {
do {
sqlite3_bind_text(db->insert_parse_job_stmt, 1, job->parse_job->filepath, -1, SQLITE_STATIC);
sqlite3_bind_int(db->insert_parse_job_stmt, 2, job->parse_job->vfile.mtime);
sqlite3_bind_int64(db->insert_parse_job_stmt, 3, (long) job->parse_job->vfile.st_size);
ret = sqlite3_step(db->insert_parse_job_stmt);
if (ret == SQLITE_FULL) {
sqlite3_reset(db->insert_parse_job_stmt);
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
usleep(1000000);
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
continue;
} else {
CRASH_IF_STMT_FAIL(ret);
}
ret = sqlite3_reset(db->insert_parse_job_stmt);
if (ret == SQLITE_FULL) {
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
usleep(100000);
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
} else if (ret != SQLITE_OK) {
LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
}
} while (ret != SQLITE_DONE && ret != SQLITE_OK);
} else if (job->type == JOB_BULK_LINE) {
do {
sqlite3_bind_text(db->insert_index_job_stmt, 1, job->bulk_line->doc_id, -1, SQLITE_STATIC);
sqlite3_bind_int(db->insert_index_job_stmt, 2, job->bulk_line->type);
if (job->bulk_line->type != ES_BULK_LINE_DELETE) {
sqlite3_bind_text(db->insert_index_job_stmt, 3, job->bulk_line->line, -1, SQLITE_STATIC);
} else {
sqlite3_bind_null(db->insert_index_job_stmt, 3);
}
ret = sqlite3_step(db->insert_index_job_stmt);
if (ret == SQLITE_FULL) {
sqlite3_reset(db->insert_index_job_stmt);
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
usleep(100000);
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
continue;
} else {
CRASH_IF_STMT_FAIL(ret);
}
ret = sqlite3_reset(db->insert_index_job_stmt);
if (ret == SQLITE_FULL) {
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
usleep(100000);
pthread_mutex_lock(&db->ipc_ctx->db_mutex);
} else if (ret != SQLITE_OK) {
LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
}
} while (ret != SQLITE_DONE && ret != SQLITE_OK);
} else {
LOG_FATAL("database.c", "FIXME: invalid job type");
}
pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
pthread_mutex_lock(&db->ipc_ctx->mutex);
db->ipc_ctx->job_count += 1;
pthread_cond_signal(&db->ipc_ctx->has_work_cond);
pthread_mutex_unlock(&db->ipc_ctx->mutex);
}
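
database_add_work() and database_get_work() implement a SQLite-backed producer/consumer queue: producers insert rows (backing off with usleep() while the size-capped IPC database is full) and signal has_work_cond; consumers pop atomically with DELETE ... RETURNING. A consumer-side sketch:

// Assumes `db` is an opened IPC_CONSUMER_DATABASE shared with the producer.
void parse_worker_loop_example(database_t *db) {
    for (;;) {
        job_t *job = database_get_work(db, JOB_PARSE_JOB);
        if (job == NULL) {
            break; // queue is empty and the producer set no_more_jobs
        }
        // parse(job->parse_job); // hypothetical hand-off to the scan module
        free(job->parse_job);
        free(job);
    }
}
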

src/database/database.h (new file, 167 lines)

@@ -0,0 +1,167 @@
#ifndef SIST2_DATABASE_H
#define SIST2_DATABASE_H
#include <sqlite3.h>
#include <cjson/cJSON.h>
#include "src/sist.h"
#include "src/index/elastic.h"
typedef struct index_descriptor index_descriptor_t;
extern const char *IpcDatabaseSchema;
extern const char *IndexDatabaseSchema;
typedef enum {
INDEX_DATABASE,
IPC_CONSUMER_DATABASE,
IPC_PRODUCER_DATABASE,
FTS_DATABASE
} database_type_t;
typedef enum {
DATABASE_STAT_INVALID,
DATABASE_STAT_TREEMAP,
DATABASE_STAT_MIME_AGG,
DATABASE_STAT_SIZE_AGG,
DATABASE_STAT_DATE_AGG,
} database_stat_type_d;
typedef enum {
JOB_UNDEFINED,
JOB_BULK_LINE,
JOB_PARSE_JOB
} job_type_t;
typedef struct {
job_type_t type;
union {
parse_job_t *parse_job;
es_bulk_line_t *bulk_line;
};
} job_t;
typedef struct {
int job_count;
int no_more_jobs;
int completed_job_count;
pthread_mutex_t mutex;
pthread_mutex_t db_mutex;
pthread_mutex_t index_db_mutex;
pthread_cond_t has_work_cond;
char current_job[MAX_THREADS][PATH_MAX * 2];
} database_ipc_ctx_t;
typedef struct database {
char filename[PATH_MAX];
database_type_t type;
sqlite3 *db;
// Prepared statements
sqlite3_stmt *select_thumbnail_stmt;
sqlite3_stmt *treemap_merge_up_update_stmt;
sqlite3_stmt *treemap_merge_up_delete_stmt;
sqlite3_stmt *mark_document_stmt;
sqlite3_stmt *write_document_stmt;
sqlite3_stmt *write_document_sidecar_stmt;
sqlite3_stmt *write_thumbnail_stmt;
sqlite3_stmt *insert_parse_job_stmt;
sqlite3_stmt *insert_index_job_stmt;
sqlite3_stmt *pop_parse_job_stmt;
sqlite3_stmt *pop_index_job_stmt;
database_ipc_ctx_t *ipc_ctx;
} database_t;
typedef struct {
database_t *db;
sqlite3_stmt *stmt;
} database_iterator_t;
typedef struct {
const char *path;
const char *parent;
long size;
} treemap_row_t;
static treemap_row_t null_treemap_row = {0, 0, 0};
database_t *database_create(const char *filename, database_type_t type);
void database_initialize(database_t *db);
void database_open(database_t *db);
void database_close(database_t *, int optimize);
void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size);
void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len);
void database_write_index_descriptor(database_t *db, index_descriptor_t *desc);
index_descriptor_t *database_read_index_descriptor(database_t *db);
void database_write_document(database_t *db, document_t *doc, const char *json_data);
database_iterator_t *database_create_document_iterator(database_t *db);
cJSON *database_document_iter(database_iterator_t *);
#define database_document_iter_foreach(element, iter) \
for (cJSON *(element) = database_document_iter(iter); (element) != NULL; (element) = database_document_iter(iter))
database_iterator_t *database_create_delete_list_iterator(database_t *db);
char * database_delete_list_iter(database_iterator_t *iter);
#define database_delete_list_iter_foreach(element, iter) \
for (char *(element) = database_delete_list_iter(iter); (element) != NULL; (element) = database_delete_list_iter(iter))
void database_incremental_scan_begin(database_t *db);
void database_incremental_scan_end(database_t *db);
int database_mark_document(database_t *db, const char *id, int mtime);
void database_write_document_sidecar(database_t *db, const char *id, const char *json_data);
database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold);
treemap_row_t database_treemap_iter(database_iterator_t *iter);
#define database_treemap_iter_foreach(element, iter) \
for (treemap_row_t element = database_treemap_iter(iter); element.path != NULL; element = database_treemap_iter(iter))
void database_generate_stats(database_t *db, double treemap_threshold);
database_stat_type_d database_get_stat_type_by_mnemonic(const char *name);
job_t *database_get_work(database_t *db, job_type_t job_type);
void database_add_work(database_t *db, job_t *job);
//void database_index(database_t *db);
cJSON *database_get_stats(database_t *db, database_stat_type_d type);
#define CRASH_IF_STMT_FAIL(x) do { \
int return_value = x; \
if (return_value != SQLITE_DONE && return_value != SQLITE_ROW) { \
LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
} \
} while (0)
#define CRASH_IF_NOT_SQLITE_OK(x) do { \
int return_value = x; \
if (return_value != SQLITE_OK) { \
LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
} \
} while (0)
#endif //SIST2_DATABASE_H
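
Note that both macros expand to sqlite3_errmsg(db->db), so they can only be used where a local database_t *db is in scope. A minimal sketch:

#include "src/database/database.h"

void transaction_example(database_t *db) {
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "BEGIN;", NULL, NULL, NULL));
    // ... prepared-statement work ...
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "COMMIT;", NULL, NULL, NULL));
}
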


@@ -0,0 +1,78 @@
const char *IpcDatabaseSchema =
"CREATE TABLE parse_job ("
" id INTEGER PRIMARY KEY,"
" filepath TEXT NOT NULL,"
" mtime INTEGER NOT NULL,"
" st_size INTEGER NOT NULL"
");"
""
"CREATE TABLE index_job ("
" id INTEGER PRIMARY KEY,"
" doc_id TEXT NOT NULL CHECK ( length(doc_id) = 32 ),"
" type INTEGER NOT NULL,"
" line TEXT"
");";
const char *IndexDatabaseSchema =
"CREATE TABLE thumbnail ("
" id TEXT NOT NULL CHECK ( length(id) = 32 ),"
" num INTEGER NOT NULL,"
" data BLOB NOT NULL,"
" PRIMARY KEY(id, num)"
") WITHOUT ROWID;"
""
"CREATE TABLE document ("
" id TEXT PRIMARY KEY CHECK ( length(id) = 32 ),"
" marked INTEGER NOT NULL DEFAULT (1),"
" mtime INTEGER NOT NULL,"
" size INTEGER NOT NULL,"
" json_data TEXT NOT NULL CHECK ( json_valid(json_data) )"
") WITHOUT ROWID;"
""
"CREATE TABLE delete_list ("
" id TEXT PRIMARY KEY CHECK ( length(id) = 32 )"
") WITHOUT ROWID;"
""
"CREATE TABLE tag ("
" id TEXT NOT NULL,"
" tag TEXT NOT NULL"
");"
""
"CREATE TABLE document_sidecar ("
" id TEXT PRIMARY KEY NOT NULL,"
" json_data TEXT NOT NULL"
") WITHOUT ROWID;"
""
"CREATE TABLE descriptor ("
" id TEXT NOT NULL,"
" version_major INTEGER NOT NULL,"
" version_minor INTEGER NOT NULL,"
" version_patch INTEGER NOT NULL,"
" root TEXT NOT NULL,"
" name TEXT NOT NULL,"
" rewrite_url TEXT,"
" timestamp INTEGER NOT NULL"
");"
""
"CREATE TABLE stats_treemap ("
" path TEXT NOT NULL,"
" size INTEGER NOT NULL"
");"
""
"CREATE TABLE stats_size_agg ("
" bucket INTEGER NOT NULL,"
" count INTEGER NOT NULL"
");"
""
"CREATE TABLE stats_date_agg ("
" bucket INTEGER NOT NULL,"
" count INTEGER NOT NULL"
");"
""
"CREATE TABLE stats_mime_agg ("
" mime TEXT NOT NULL,"
" size INTEGER NOT NULL,"
" count INTEGER NOT NULL"
");";


@@ -0,0 +1,242 @@
#include "database.h"
#include "src/sist.h"
#include "src/ctx.h"
#define TREEMAP_MINIMUM_MERGES_TO_CONTINUE (100)
#define SIZE_BUCKET (long)(5 * 1000 * 1000)
#define DATE_BUCKET (long)(2629800) // ~30 days
database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold) {
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db,
"SELECT path, path_parent(path), size FROM tm"
" WHERE path_parent(path) IN (SELECT path FROM tm)"
" AND size<?",
-1, &stmt, NULL);
sqlite3_bind_int64(stmt, 1, threshold);
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
iter->stmt = stmt;
iter->db = db;
return iter;
}
treemap_row_t database_treemap_iter(database_iterator_t *iter) {
if (iter->stmt == NULL) {
LOG_FATAL("database.c", "FIXME: database_treemap_iter() called after iteration stopped");
}
int ret = sqlite3_step(iter->stmt);
if (ret == SQLITE_ROW) {
treemap_row_t row = {
.path = (const char *) sqlite3_column_text(iter->stmt, 0),
.parent = (const char *) sqlite3_column_text(iter->stmt, 1),
.size = sqlite3_column_int64(iter->stmt, 2)
};
return row;
}
if (ret != SQLITE_DONE) {
LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
}
sqlite3_finalize(iter->stmt);
iter->stmt = NULL;
return (treemap_row_t) {NULL, NULL, 0};
}
void database_generate_stats(database_t *db, double treemap_threshold) {
LOG_INFO("database.c", "Generating stats");
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_size_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_date_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_mime_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_treemap;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(
sqlite3_exec(db->db, "CREATE TEMP TABLE tm(path TEXT PRIMARY KEY, size INT);", NULL, NULL, NULL));
sqlite3_prepare_v2(db->db, "UPDATE tm SET size=size+? WHERE path=?;", -1, &db->treemap_merge_up_update_stmt, NULL);
sqlite3_prepare_v2(db->db, "DELETE FROM tm WHERE path = ?;", -1, &db->treemap_merge_up_delete_stmt, NULL);
// size aggregation
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db, "INSERT INTO stats_size_agg"
" SELECT"
" cast(size / ?1 as int) * ?1 as bucket,"
" count(*) as count"
" FROM document"
" GROUP BY bucket", -1, &stmt, NULL);
sqlite3_bind_int(stmt, 1, SIZE_BUCKET);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// date aggregation
sqlite3_prepare_v2(db->db, "INSERT INTO stats_date_agg"
" SELECT"
" cast(mtime / ?1 as int) * ?1 as bucket,"
" count(*) as count"
" FROM document"
" GROUP BY bucket", -1, &stmt, NULL);
sqlite3_bind_int(stmt, 1, DATE_BUCKET);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// mime aggregation
sqlite3_prepare_v2(db->db, "INSERT INTO stats_mime_agg"
" SELECT"
" (json_data->>'mime') as bucket,"
" sum(size),"
" count(*)"
" FROM document"
" WHERE bucket IS NOT NULL"
" GROUP BY bucket", -1, &stmt, NULL);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// Treemap
sqlite3_prepare_v2(db->db, "SELECT SUM(size) FROM document;", -1, &stmt, NULL);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
long total_size = sqlite3_column_int64(stmt, 0);
long threshold = (long) ((double) total_size * treemap_threshold);
sqlite3_finalize(stmt);
// flat map
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
"INSERT INTO tm (path, size) SELECT json_data->>'path' as path, sum(size)"
" FROM document WHERE json_data->>'parent' IS NULL GROUP BY path;",
NULL, NULL, NULL));
// Merge up
int merged_rows = 0;
do {
if (merged_rows) {
LOG_INFOF("database.c", "Treemap merge iteration (%d rows changed)", merged_rows);
}
merged_rows = 0;
sqlite3_prepare_v2(db->db,
"INSERT INTO tm (path, size) SELECT path_parent(path) as parent, 0 "
" FROM tm WHERE parent not IN (SELECT path FROM tm) AND size<?"
" ON CONFLICT DO NOTHING;", -1, &stmt, NULL);
sqlite3_bind_int64(stmt, 1, threshold);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
database_iterator_t *iter = database_create_treemap_iterator(db, threshold);
database_treemap_iter_foreach(row, iter) {
sqlite3_bind_int64(db->treemap_merge_up_update_stmt, 1, row.size);
sqlite3_bind_text(db->treemap_merge_up_update_stmt, 2, row.parent, -1, SQLITE_STATIC);
CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_update_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_update_stmt));
sqlite3_bind_text(db->treemap_merge_up_delete_stmt, 1, row.path, -1, SQLITE_STATIC);
CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_delete_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_delete_stmt));
merged_rows += 1;
}
} while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE);
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
"INSERT INTO stats_treemap (path, size) SELECT path,size FROM tm;",
NULL, NULL, NULL));
LOG_INFO("database.c", "Done!");
}
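
The merge-up loop folds every directory whose subtree size is below the threshold into its parent (inserting missing parents with size 0 first) and repeats while an iteration still merges more than TREEMAP_MINIMUM_MERGES_TO_CONTINUE rows. A worked sketch of one pass, assuming threshold = 100:

// tm before:  { "/a/b": 40, "/a/c": 500, "/d/e": 30 }
// insert missing parents of small rows: "/a": 0, "/d": 0
// "/a/b" (40 < 100)   merges into "/a" -> "/a": 40,  "/a/b" deleted
// "/d/e" (30 < 100)   merges into "/d" -> "/d": 30,  "/d/e" deleted
// "/a/c" (500 >= 100) is kept as its own treemap tile
// tm after:   { "/a": 40, "/a/c": 500, "/d": 30 }
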
database_stat_type_d database_get_stat_type_by_mnemonic(const char *name) {
if (strcmp(name, "TMAP") == 0) {
return DATABASE_STAT_TREEMAP;
}
if (strcmp(name, "MAGG") == 0) {
return DATABASE_STAT_MIME_AGG;
}
if (strcmp(name, "SAGG") == 0) {
return DATABASE_STAT_SIZE_AGG;
}
if (strcmp(name, "DAGG") == 0) {
return DATABASE_STAT_DATE_AGG;
}
return DATABASE_STAT_INVALID;
}
cJSON *database_get_stats(database_t *db, database_stat_type_d type) {
sqlite3_stmt *stmt;
switch (type) {
case DATABASE_STAT_TREEMAP:
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "SELECT path,size FROM stats_treemap", -1, &stmt, NULL
));
break;
case DATABASE_STAT_DATE_AGG:
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "SELECT bucket,count FROM stats_date_agg", -1, &stmt, NULL
));
break;
case DATABASE_STAT_SIZE_AGG:
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "SELECT bucket,count FROM stats_size_agg", -1, &stmt, NULL
));
break;
case DATABASE_STAT_MIME_AGG:
CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
db->db, "SELECT mime,size,count FROM stats_mime_agg", -1, &stmt, NULL
));
break;
case DATABASE_STAT_INVALID:
default:
LOG_FATALF("database_stats.c", "Invalid stat type: %d", type);
}
cJSON *json = cJSON_CreateArray();
int ret;
do {
ret = sqlite3_step(stmt);
CRASH_IF_STMT_FAIL(ret);
if (ret == SQLITE_DONE) {
break;
}
cJSON *row = cJSON_CreateObject();
switch (type) {
case DATABASE_STAT_TREEMAP:
cJSON_AddStringToObject(row, "path", (const char *) sqlite3_column_text(stmt, 0));
cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
break;
case DATABASE_STAT_DATE_AGG:
case DATABASE_STAT_SIZE_AGG:
cJSON_AddNumberToObject(row, "bucket", (double) sqlite3_column_int64(stmt, 0));
cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 1));
break;
case DATABASE_STAT_MIME_AGG:
cJSON_AddStringToObject(row, "mime", (const char *) sqlite3_column_text(stmt, 0));
cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 2));
break;
}
cJSON_AddItemToArray(json, row);
} while (TRUE);
return json;
}
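
A consumer sketch tying the two functions together; the mnemonic would typically come from a stats endpoint, and puts() stands in for the real response path:

#include <stdio.h>
#include "src/database/database.h"

void stats_endpoint_example(database_t *db, const char *mnemonic) {
    database_stat_type_d type = database_get_stat_type_by_mnemonic(mnemonic);
    if (type == DATABASE_STAT_INVALID) {
        return; // anything but TMAP / MAGG / SAGG / DAGG
    }
    cJSON *stats = database_get_stats(db, type);
    char *json = cJSON_PrintUnformatted(stats);
    puts(json);
    cJSON_free(json);
    cJSON_Delete(stats);
}
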


@@ -29,7 +29,7 @@ void destroy_indexer(es_indexer_t *indexer) {
return;
}
LOG_DEBUG("elastic.c", "Destroying indexer")
LOG_DEBUG("elastic.c", "Destroying indexer");
if (indexer->es_url != NULL) {
free(indexer->es_url);
@@ -64,18 +64,17 @@ void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {
cJSON_Delete(line);
}
void index_json_func(void *arg) {
es_bulk_line_t *line = arg;
elastic_index_line(line);
}
void delete_document(const char *document_id) {
es_bulk_line_t bulk_line;
void delete_document(const char* document_id_str, void* UNUSED(_data)) {
es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t));
bulk_line->type = ES_BULK_LINE_DELETE;
bulk_line->next = NULL;
bulk_line.type = ES_BULK_LINE_DELETE;
bulk_line.next = NULL;
strcpy(bulk_line.doc_id, document_id);
strcpy(bulk_line->doc_id, document_id_str);
tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
tpool_add_work(IndexCtx.pool, &(job_t) {
.type = JOB_BULK_LINE,
.bulk_line = &bulk_line,
});
}
@@ -92,7 +91,11 @@ void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
bulk_line->next = NULL;
cJSON_free(json);
tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
tpool_add_work(IndexCtx.pool, &(job_t) {
.type = JOB_BULK_LINE,
.bulk_line = bulk_line,
});
free(bulk_line);
}
void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {
@@ -266,7 +269,7 @@ void print_error(response_t *r) {
void _elastic_flush(int max) {
if (max == 0) {
LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue")
LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue");
return;
}
@@ -279,13 +282,13 @@ void _elastic_flush(int max) {
response_t *r = web_post(bulk_url, buf, IndexCtx.es_insecure_ssl);
if (r->status_code == 0) {
LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url)
LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url);
}
if (r->status_code == 413) {
if (max <= 1) {
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id)
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id);
free_response(r);
free(buf);
free_queue(1);
@@ -306,7 +309,7 @@ void _elastic_flush(int max) {
free_response(r);
free(buf);
LOG_WARNING("elastic.c", "Got 429 status, will retry after delay")
LOG_WARNING("elastic.c", "Got 429 status, will retry after delay");
usleep(1000000 * 20);
_elastic_flush(max);
return;
@@ -441,7 +444,7 @@ es_version_t *elastic_get_version(const char *es_url, int insecure) {
}
if (cJSON_GetObjectItem(response, "error") != NULL) {
LOG_WARNING("elastic.c", "Could not get Elasticsearch version")
LOG_WARNING("elastic.c", "Could not get Elasticsearch version");
print_error(r);
free_response(r);
return NULL;
@@ -477,7 +480,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
IndexCtx.es_version = es_version;
if (es_version == NULL) {
LOG_FATAL("elastic.c", "Could not get ES version")
LOG_FATAL("elastic.c", "Could not get ES version");
}
LOG_INFOF("elastic.c",
@@ -485,7 +488,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), IS_LEGACY_VERSION(es_version));
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
LOG_FATAL("elastic.c", "This elasticsearch version is not supported!")
LOG_FATAL("elastic.c", "This elasticsearch version is not supported!");
}
char *settings = NULL;
@@ -512,7 +515,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
if (r->status_code != 200) {
print_error(r);
LOG_FATAL("elastic.c", "Could not create index")
LOG_FATAL("elastic.c", "Could not create index");
}
LOG_INFOF("elastic.c", "Create index <%d>", r->status_code);
@@ -533,12 +536,13 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);
LOG_FATAL("elastic.c", "Could not update user settings")
LOG_FATAL("elastic.c", "Could not update user settings");
}
free_response(r);
if (IS_LEGACY_VERSION(es_version)) {
snprintf(url, sizeof(url), "%s/%s/_mappings/_doc?include_type_name=true", IndexCtx.es_url, IndexCtx.es_index);
snprintf(url, sizeof(url), "%s/%s/_mappings/_doc?include_type_name=true", IndexCtx.es_url,
IndexCtx.es_index);
} else {
snprintf(url, sizeof(url), "%s/%s/_mappings", IndexCtx.es_url, IndexCtx.es_index);
}
@@ -547,7 +551,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
LOG_INFOF("elastic.c", "Update ES mappings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);
LOG_FATAL("elastic.c", "Could not update user mappings")
LOG_FATAL("elastic.c", "Could not update user mappings");
}
free_response(r);


@@ -46,7 +46,7 @@ void print_json(cJSON *document, const char index_id_str[SIST_INDEX_ID_LEN]);
void index_json(cJSON *document, const char doc_id[SIST_INDEX_ID_LEN]);
void delete_document(const char *document_id_str, void* data);
void delete_document(const char *document_id);
es_indexer_t *create_indexer(const char *url, const char *index);

File diff suppressed because one or more lines are too long


@@ -65,7 +65,7 @@ void web_post_async_poll(subreq_ctx_t *req) {
curl_easy_getinfo(req->handle, CURLINFO_RESPONSE_CODE, &req->response->status_code);
if (req->response->status_code == 0) {
LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer)
LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer);
}
curl_multi_cleanup(req->multi);
@@ -104,7 +104,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
curl_multi_add_handle(req->multi, curl);
curl_multi_perform(req->multi, &req->running_handles);
LOG_DEBUGF("web.c", "async request POST %s", url)
LOG_DEBUGF("web.c", "async request POST %s", url);
return req;
}
@@ -136,7 +136,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp->status_code);
if (resp->status_code == 0) {
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
}
curl_easy_cleanup(curl);
@@ -180,7 +180,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
resp->size = buffer.cur;
if (resp->status_code == 0) {
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
}
curl_easy_cleanup(curl);


@@ -1,9 +1,7 @@
#include "src/ctx.h"
#include "serialize.h"
#include "src/parsing/parse.h"
#include "src/parsing/mime.h"
#include <zstd.h>
char *get_meta_key_text(enum metakey meta_key) {
@@ -79,7 +77,7 @@ char *get_meta_key_text(enum metakey meta_key) {
case MetaChecksum:
return "checksum";
default:
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key);
}
}
@@ -93,8 +91,6 @@ char *build_json_string(document_t *doc) {
} else {
cJSON_AddStringToObject(json, "mime", mime_text);
}
cJSON_AddNumberToObject(json, "size", (double) doc->size);
cJSON_AddNumberToObject(json, "mtime", doc->mtime);
// Ignore root directory in the file path
doc->ext = (short) (doc->ext - ScanCtx.index.desc.root_len);
@@ -124,8 +120,6 @@ char *build_json_string(document_t *doc) {
cJSON_AddStringToObject(json, "path", "");
}
cJSON_AddStringToObject(json, "_id", doc->doc_id);
// Metadata
meta_line_t *meta = doc->meta_head;
while (meta != NULL) {
@@ -175,7 +169,7 @@ char *build_json_string(document_t *doc) {
break;
}
default:
LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key))
LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key));
}
meta_line_t *tmp = meta;
@@ -189,391 +183,10 @@ char *build_json_string(document_t *doc) {
return json_str;
}
static struct {
FILE *out_file;
size_t buf_out_size;
void *buf_out;
ZSTD_CCtx *cctx;
} WriterCtx = {
.out_file = NULL
};
#define ZSTD_COMPRESSION_LEVEL 10
void initialize_writer_ctx(const char *file_path) {
WriterCtx.out_file = fopen(file_path, "wb");
WriterCtx.buf_out_size = ZSTD_CStreamOutSize();
WriterCtx.buf_out = malloc(WriterCtx.buf_out_size);
WriterCtx.cctx = ZSTD_createCCtx();
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_compressionLevel, ZSTD_COMPRESSION_LEVEL);
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_checksumFlag, FALSE);
LOG_DEBUGF("serialize.c", "Open index file for writing %s", file_path)
}
void zstd_write_string(const char *string, const size_t len) {
ZSTD_inBuffer input = {string, len, 0};
do {
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
ZSTD_compressStream2(WriterCtx.cctx, &output, &input, ZSTD_e_continue);
if (output.pos > 0) {
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
}
} while (input.pos != input.size);
}
void write_document_func(void *arg) {
if (WriterCtx.out_file == NULL) {
char dstfile[PATH_MAX];
snprintf(dstfile, PATH_MAX, "%s_index_main.ndjson.zst", ScanCtx.index.path);
initialize_writer_ctx(dstfile);
}
document_t *doc = arg;
char *json_str = build_json_string(doc);
const size_t json_str_len = strlen(json_str);
json_str = realloc(json_str, json_str_len + 1);
*(json_str + json_str_len) = '\n';
zstd_write_string(json_str, json_str_len + 1);
free(json_str);
free(doc->filepath);
}
void zstd_close() {
if (WriterCtx.out_file == NULL) {
LOG_DEBUG("serialize.c", "No zstd stream to close, skipping cleanup")
return;
}
size_t remaining;
do {
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
remaining = ZSTD_endStream(WriterCtx.cctx, &output);
if (output.pos > 0) {
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
}
} while (remaining != 0);
ZSTD_freeCCtx(WriterCtx.cctx);
free(WriterCtx.buf_out);
fclose(WriterCtx.out_file);
LOG_DEBUG("serialize.c", "End zstd stream & close index file")
}
void writer_cleanup() {
zstd_close();
WriterCtx.out_file = NULL;
}
void write_index_descriptor(char *path, index_descriptor_t *desc) {
cJSON *json = cJSON_CreateObject();
cJSON_AddStringToObject(json, "id", desc->id);
cJSON_AddStringToObject(json, "version", desc->version);
cJSON_AddStringToObject(json, "root", desc->root);
cJSON_AddStringToObject(json, "name", desc->name);
cJSON_AddStringToObject(json, "type", desc->type);
cJSON_AddStringToObject(json, "rewrite_url", desc->rewrite_url);
cJSON_AddNumberToObject(json, "timestamp", (double) desc->timestamp);
int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
if (fd < 0) {
LOG_FATALF("serialize.c", "Could not open index descriptor: %s", strerror(errno));
}
char *str = cJSON_Print(json);
size_t ret = write(fd, str, strlen(str));
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
}
free(str);
close(fd);
cJSON_Delete(json);
}
index_descriptor_t read_index_descriptor(char *path) {
struct stat info;
stat(path, &info);
int fd = open(path, O_RDONLY);
if (fd == -1) {
LOG_FATALF("serialize.c", "Invalid/corrupt index (Could not find descriptor): %s: %s\n", path, strerror(errno))
}
char *buf = malloc(info.st_size + 1);
size_t ret = read(fd, buf, info.st_size);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not read index descriptor: %s", strerror(errno));
}
*(buf + info.st_size) = '\0';
close(fd);
cJSON *json = cJSON_Parse(buf);
index_descriptor_t descriptor;
descriptor.timestamp = (long) cJSON_GetObjectItem(json, "timestamp")->valuedouble;
strcpy(descriptor.root, cJSON_GetObjectItem(json, "root")->valuestring);
strcpy(descriptor.name, cJSON_GetObjectItem(json, "name")->valuestring);
strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
descriptor.root_len = (short) strlen(descriptor.root);
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
strcpy(descriptor.id, cJSON_GetObjectItem(json, "id")->valuestring);
if (cJSON_GetObjectItem(json, "type") == NULL) {
strcpy(descriptor.type, INDEX_TYPE_NDJSON);
} else {
strcpy(descriptor.type, cJSON_GetObjectItem(json, "type")->valuestring);
}
cJSON_Delete(json);
free(buf);
return descriptor;
}
void write_document(document_t *doc) {
tpool_add_work(ScanCtx.writer_pool, write_document_func, doc);
}
char *json_str = build_json_string(doc);
void thread_cleanup() {
cleanup_parse();
cleanup_font();
}
void read_index_bin_handle_line(const char *line, const char *index_id, index_func func) {
cJSON *document = cJSON_Parse(line);
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
cJSON_AddStringToObject(document, "index", index_id);
// Load meta from sidecar files
cJSON *meta_obj = NULL;
if (IndexCtx.meta != NULL) {
const char *meta_string = g_hash_table_lookup(IndexCtx.meta, path_md5_str);
if (meta_string != NULL) {
meta_obj = cJSON_Parse(meta_string);
cJSON *child;
for (child = meta_obj->child; child != NULL; child = child->next) {
char meta_key[4096];
strcpy(meta_key, child->string);
cJSON_DeleteItemFromObject(document, meta_key);
cJSON_AddItemReferenceToObject(document, meta_key, child);
}
}
}
// Load tags from tags DB
if (IndexCtx.tags != NULL) {
const char *tags_string = g_hash_table_lookup(IndexCtx.tags, path_md5_str);
if (tags_string != NULL) {
cJSON *tags_arr = cJSON_Parse(tags_string);
cJSON_DeleteItemFromObject(document, "tag");
cJSON_AddItemToObject(document, "tag", tags_arr);
}
}
func(document, path_md5_str);
cJSON_DeleteItemFromObject(document, "_id");
cJSON_Delete(document);
if (meta_obj) {
cJSON_Delete(meta_obj);
}
}
void read_lines(const char *path, const line_processor_t processor) {
dyn_buffer_t buf = dyn_buffer_create();
// Initialize zstd decompression state
FILE *file = fopen(path, "rb");
size_t const buf_in_size = ZSTD_DStreamInSize();
void *const buf_in = malloc(buf_in_size);
size_t const buf_out_size = ZSTD_DStreamOutSize();
void *const buf_out = malloc(buf_out_size);
ZSTD_DCtx *const dctx = ZSTD_createDCtx();
size_t read;
size_t last_ret = 0;
while ((read = fread(buf_in, 1, buf_in_size, file))) {
ZSTD_inBuffer input = {buf_in, read, 0};
while (input.pos < input.size) {
ZSTD_outBuffer output = {buf_out, buf_out_size, 0};
size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
for (int i = 0; i < output.pos; i++) {
char c = ((char *) output.dst)[i];
if (c == '\n') {
dyn_buffer_write_char(&buf, '\0');
processor.func(buf.buf, processor.data);
buf.cur = 0;
} else {
dyn_buffer_write_char(&buf, c);
}
}
last_ret = ret;
}
}
if (last_ret != 0) {
/* The last return value from ZSTD_decompressStream did not end on a
* frame, but we reached the end of the file! We assume this is an
* error, and the input was truncated.
*/
LOG_FATALF("serialize.c", "EOF before end of stream: %zu", last_ret)
}
ZSTD_freeDCtx(dctx);
free(buf_in);
free(buf_out);
dyn_buffer_destroy(&buf);
fclose(file);
}
void read_index_ndjson(const char *line, void *_data) {
void **data = _data;
const char *index_id = data[0];
index_func func = data[1];
read_index_bin_handle_line(line, index_id, func);
}
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
read_lines(path, (line_processor_t) {
.data = (void *[2]) {(void *) index_id, func},
.func = read_index_ndjson,
});
}
}
static __thread GHashTable *IncrementalReadTable = NULL;
void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;
incremental_put(IncrementalReadTable, path_md5_str, mtime);
}
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
IncrementalReadTable = table;
read_index(filepath, desc->id, desc->type, json_put_incremental);
}
static __thread GHashTable *IncrementalCopyTable = NULL;
static __thread GHashTable *IncrementalNewTable = NULL;
static __thread store_t *IncrementalCopySourceStore = NULL;
static __thread store_t *IncrementalCopyDestinationStore = NULL;
void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
// Copy index line
cJSON_DeleteItemFromObject(document, "index");
char *json_str = cJSON_PrintUnformatted(document);
const size_t json_str_len = strlen(json_str);
json_str = realloc(json_str, json_str_len + 1);
*(json_str + json_str_len) = '\n';
// Copy tn store contents
size_t buf_len;
char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len);
if (buf_len != 0) {
store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len);
free(buf);
}
// Also copy additional thumbnails
if (cJSON_GetObjectItem(document, "thumbnail") != NULL) {
const int thumbnail_count = cJSON_GetObjectItem(document, "thumbnail")->valueint;
for (int i = 1; i < thumbnail_count; i++) {
char tn_key[SIST_DOC_ID_LEN + sizeof(char) * 4];
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc_id, i);
buf = store_read(IncrementalCopySourceStore, tn_key, sizeof(tn_key), &buf_len);
if (buf_len != 0) {
store_write(IncrementalCopyDestinationStore, tn_key, sizeof(tn_key), buf, buf_len);
free(buf);
}
}
}
zstd_write_string(json_str, json_str_len + 1);
free(json_str);
}
}
/**
* Copy items from an index that are in the copy_table. Also copies from
* the store.
*/
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
const char *dst_filepath, GHashTable *copy_table) {
if (WriterCtx.out_file == NULL) {
initialize_writer_ctx(dst_filepath);
}
IncrementalCopyTable = copy_table;
IncrementalCopySourceStore = store;
IncrementalCopyDestinationStore = dst_store;
read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
}
void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
char doc_id_n[SIST_DOC_ID_LEN + 1];
doc_id_n[SIST_DOC_ID_LEN] = '\0';
doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
// do not delete archive virtual entries
if (cJSON_GetObjectItem(document, "parent") == NULL
&& !incremental_get(IncrementalCopyTable, doc_id)
&& !incremental_get(IncrementalNewTable, doc_id)
) {
memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
zstd_write_string(doc_id, sizeof(doc_id_n));
}
}
void incremental_delete(const char *del_filepath, const char *index_filepath,
GHashTable *copy_table, GHashTable *new_table) {
if (WriterCtx.out_file == NULL) {
initialize_writer_ctx(del_filepath);
}
IncrementalCopyTable = copy_table;
IncrementalNewTable = new_table;
read_index(index_filepath, "", INDEX_TYPE_NDJSON, incremental_delete_handle_doc);
}
database_write_document(ProcData.index_db, doc, json_str);
free(doc);
free(json_str);
}


@@ -2,55 +2,7 @@
#define SIST2_SERIALIZE_H
#include "src/sist.h"
#include "store.h"
#include <sys/syscall.h>
#include <glib.h>
typedef struct line_processor {
void* data;
void (*func)(const char*, void*);
} line_processor_t;
typedef void(*index_func)(cJSON *, const char[SIST_DOC_ID_LEN]);
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
const char *dst_filepath, GHashTable *copy_table);
void incremental_delete(const char *del_filepath, const char* index_filepath,
GHashTable *copy_table, GHashTable *new_table);
void write_document(document_t *doc);
void read_lines(const char *path, const line_processor_t processor);
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func);
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc);
/**
* Must be called after write_document
*/
void thread_cleanup();
void writer_cleanup();
void write_index_descriptor(char *path, index_descriptor_t *desc);
index_descriptor_t read_index_descriptor(char *path);
// caller ensures char file_path[PATH_MAX]
#define READ_INDICES(file_path, index_path, action_ok, action_main_fail, cond_original) \
snprintf(file_path, PATH_MAX, "%s_index_main.ndjson.zst", index_path); \
if (access(file_path, R_OK) == 0) { \
action_ok; \
} else { \
action_main_fail; \
} \
snprintf(file_path, PATH_MAX, "%s_index_original.ndjson.zst", index_path); \
if ((cond_original) && access(file_path, R_OK) == 0) { \
action_ok; \
}
#endif
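// Illustration (not part of this diff): READ_INDICES(file_path, path, A, B, cond)
// expands roughly to the following, so `A` runs once for the main index and once
// more for the original index when `cond` holds:
//
//     snprintf(file_path, PATH_MAX, "%s_index_main.ndjson.zst", path);
//     if (access(file_path, R_OK) == 0) { A; } else { B; }
//     snprintf(file_path, PATH_MAX, "%s_index_original.ndjson.zst", path);
//     if ((cond) && access(file_path, R_OK) == 0) { A; }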

View File

@@ -1,195 +0,0 @@
#include "store.h"
#include "src/ctx.h"
store_t *store_create(const char *path, size_t chunk_size) {
store_t *store = malloc(sizeof(struct store_t));
mkdir(path, S_IWUSR | S_IRUSR | S_IXUSR);
strcpy(store->path, path);
#if (SIST_FAKE_STORE != 1)
store->chunk_size = chunk_size;
pthread_rwlock_init(&store->lock, NULL);
mdb_env_create(&store->env);
int open_ret = mdb_env_open(store->env,
path,
MDB_WRITEMAP | MDB_MAPASYNC,
S_IRUSR | S_IWUSR
);
if (open_ret != 0) {
LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
}
store->size = (size_t) store->chunk_size;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi
MDB_txn *txn;
mdb_txn_begin(store->env, NULL, 0, &txn);
mdb_dbi_open(txn, NULL, 0, &store->dbi);
mdb_txn_commit(txn);
#endif
return store;
}
void store_destroy(store_t *store) {
#if (SIST_FAKE_STORE != 1)
pthread_rwlock_destroy(&store->lock);
mdb_dbi_close(store->env, store->dbi);
mdb_env_close(store->env);
#endif
free(store);
}
void store_flush(store_t *store) {
mdb_env_sync(store->env, TRUE);
}
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
if (LogCtx.very_verbose) {
LOG_DEBUGF("store.c", "Store write %s@{%s} %lu bytes", store->path, key, buf_len)
}
#if (SIST_FAKE_STORE != 1)
MDB_val mdb_key;
mdb_key.mv_data = key;
mdb_key.mv_size = key_len;
MDB_val mdb_value;
mdb_value.mv_data = buf;
mdb_value.mv_size = buf_len;
MDB_txn *txn;
pthread_rwlock_rdlock(&store->lock);
mdb_txn_begin(store->env, NULL, 0, &txn);
int put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
ScanCtx.stat_tn_size += buf_len;
int db_full = FALSE;
int should_abort_transaction = FALSE;
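// MDB_MAP_FULL can be reported either by mdb_put() or later by mdb_txn_commit();
// both cases fall through to the resize-and-retry path below.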
if (put_ret == MDB_MAP_FULL) {
db_full = TRUE;
should_abort_transaction = TRUE;
} else {
int commit_ret = mdb_txn_commit(txn);
if (commit_ret == MDB_MAP_FULL) {
db_full = TRUE;
}
}
if (db_full) {
LOG_DEBUGF("store.c", "Updating mdb mapsize to %lu bytes", store->size)
if (should_abort_transaction) {
mdb_txn_abort(txn);
}
pthread_rwlock_unlock(&store->lock);
// Cannot resize while a transaction is open.
// The resize takes effect on the next commit.
pthread_rwlock_wrlock(&store->lock);
store->size += store->chunk_size;
int resize_ret = mdb_env_set_mapsize(store->env, store->size);
if (resize_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(resize_ret))
}
mdb_txn_begin(store->env, NULL, 0, &txn);
int put_ret_retry = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
if (put_ret_retry != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret_retry))
}
int ret = mdb_txn_commit(txn);
if (ret != 0) {
LOG_FATALF("store.c", "FIXME: Could not commit to store %s: %s (%d), %d, %d %d",
store->path, mdb_strerror(ret), ret,
put_ret, put_ret_retry);
}
LOG_DEBUGF("store.c", "Updated mdb mapsize to %lu bytes", store->size)
} else if (put_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
pthread_rwlock_unlock(&store->lock);
#endif
}
char *store_read(store_t *store, char *key, size_t key_len, size_t *ret_vallen) {
char *buf = NULL;
#if (SIST_FAKE_STORE != 1)
MDB_val mdb_key;
mdb_key.mv_data = key;
mdb_key.mv_size = key_len;
MDB_val mdb_value;
MDB_txn *txn;
mdb_txn_begin(store->env, NULL, MDB_RDONLY, &txn);
int get_ret = mdb_get(txn, store->dbi, &mdb_key, &mdb_value);
if (get_ret == MDB_NOTFOUND) {
*ret_vallen = 0;
} else {
*ret_vallen = mdb_value.mv_size;
buf = malloc(mdb_value.mv_size);
memcpy(buf, mdb_value.mv_data, mdb_value.mv_size);
}
mdb_txn_abort(txn);
#endif
return buf;
}
GHashTable *store_read_all(store_t *store) {
int count = 0;
GHashTable *table = g_hash_table_new_full(g_str_hash, g_str_equal, free, free);
MDB_txn *txn = NULL;
mdb_txn_begin(store->env, NULL, MDB_RDONLY, &txn);
MDB_cursor *cur = NULL;
mdb_cursor_open(txn, store->dbi, &cur);
MDB_val key;
MDB_val value;
while (mdb_cursor_get(cur, &key, &value, MDB_NEXT) == 0) {
char *key_str = malloc(key.mv_size);
memcpy(key_str, key.mv_data, key.mv_size);
char *val_str = malloc(value.mv_size);
memcpy(val_str, value.mv_data, value.mv_size);
g_hash_table_insert(table, key_str, val_str);
count += 1;
}
const char *path;
mdb_env_get_path(store->env, &path);
LOG_DEBUGF("store.c", "Read %d entries from %s", count, path);
mdb_cursor_close(cur);
mdb_txn_abort(txn);
return table;
}
void store_copy(store_t *store, const char *destination) {
mkdir(destination, S_IWUSR | S_IRUSR | S_IXUSR);
mdb_env_copy(store->env, destination);
}

View File

@@ -1,37 +0,0 @@
#ifndef SIST2_STORE_H
#define SIST2_STORE_H
#include <pthread.h>
#include <lmdb.h>
#include <glib.h>
#define STORE_SIZE_TN (1024 * 1024 * 5)
#define STORE_SIZE_TAG (1024 * 1024)
#define STORE_SIZE_META STORE_SIZE_TAG
typedef struct store_t {
char path[PATH_MAX];
char *tmp_path;
MDB_dbi dbi;
MDB_env *env;
size_t size;
size_t chunk_size;
pthread_rwlock_t lock;
} store_t;
store_t *store_create(const char *path, size_t chunk_size);
void store_destroy(store_t *store);
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len);
void store_flush(store_t *store);
char *store_read(store_t *store, char *key, size_t key_len, size_t *ret_vallen);
GHashTable *store_read_all(store_t *store);
void store_copy(store_t *store, const char *destination);
#endif
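// Hedged usage sketch (not part of this diff): the STORE_SIZE_* constants are
// initial LMDB map sizes; store_write() grows the map in chunk_size increments
// whenever MDB_MAP_FULL is hit. Paths and buffers below are hypothetical.
//
//     store_t *tn_store = store_create("index/thumbs", STORE_SIZE_TN);
//     store_write(tn_store, doc_id, SIST_DOC_ID_LEN, thumb_buf, thumb_len);
//     size_t len;
//     char *thumb = store_read(tn_store, doc_id, SIST_DOC_ID_LEN, &len);
//     free(thumb);
//     store_flush(tn_store);
//     store_destroy(tn_store);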

View File

@@ -1,44 +1,12 @@
#include "walk.h"
#include "src/ctx.h"
#include "src/parsing/parse.h"
#include "src/parsing/fs_util.h"
#include <ftw.h>
#include <pthread.h>
#define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
parse_job_t *job = malloc(sizeof(parse_job_t) + len);
strcpy(job->filepath, filepath);
job->base = base;
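// job->ext is stored as an offset into filepath (one past the last '.'),
// or the full length when the name has no extension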
char *p = strrchr(filepath + base, '.');
if (p != NULL) {
job->ext = (int) (p - filepath + 1);
} else {
job->ext = len;
}
job->vfile.info = *info;
job->parent[0] = '\0';
job->vfile.filepath = job->filepath;
job->vfile.read = fs_read;
// Filesystem reads are always rewindable
job->vfile.read_rewindable = fs_read;
job->vfile.reset = fs_reset;
job->vfile.close = fs_close;
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
job->vfile.has_checksum = FALSE;
job->vfile.rewind_buffer_size = 0;
job->vfile.rewind_buffer = NULL;
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
return job;
}
int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
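// pcre_exec() returns a non-negative match count on success, so EXCLUDED(str)
// is true when the exclude regex matches anywhere in str.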
@@ -53,12 +21,9 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
}
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
LOG_DEBUGF("walk.c", "Excluded: %s", filepath);
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
} else if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
@@ -67,8 +32,13 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
}
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
parse_job_t *job = create_parse_job(filepath, (int) info->st_mtim.tv_sec, info->st_size);
tpool_add_work(ScanCtx.pool, &(job_t) {
.type = JOB_PARSE_JOB,
.parse_job = job
});
free(job);
}
return FTW_CONTINUE;
@@ -109,14 +79,7 @@ int iterate_file_list(void *input_file) {
}
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
if (S_ISREG(info.st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
}
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path);
continue;
}
@@ -124,11 +87,14 @@ int iterate_file_list(void *input_file) {
LOG_FATALF("walk.c", "File is not a child of the root folder (%s): %s", ScanCtx.index.desc.root, buf);
}
int base = (int) (strrchr(buf, '/') - buf) + 1;
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
parse_job_t *job = create_parse_job(absolute_path, (int) info.st_mtim.tv_sec, info.st_size);
free(absolute_path);
tpool_add_work(ScanCtx.pool, parse, job);
tpool_add_work(ScanCtx.pool, &(job_t) {
.type = JOB_PARSE_JOB,
.parse_job = job
});
free(job);
}
return 0;

View File

@@ -21,8 +21,6 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
char log_str[LOG_MAX_LENGTH];
unsigned long long pid = (unsigned long long) pthread_self();
char datetime[32];
time_t t;
struct tm result;
@@ -42,8 +40,8 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
log_len = snprintf(
log_str, sizeof(log_str),
"{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
"{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
);
cJSON_Delete(filepath_json);
@@ -58,15 +56,15 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
if (is_tty) {
log_len = snprintf(
log_str, sizeof(log_str),
"\033[%dm[%04llX]%s [%s] [%s %s] ",
31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
"\033[%dmT%d%s [%s] [%s %s] ",
31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
datetime, log_levels[level], filepath
);
} else {
log_len = snprintf(
log_str, sizeof(log_str),
"[%04llX] [%s] [%s %s] ",
pid, datetime, log_levels[level], filepath
"T%d [%s] [%s %s] ",
ProcData.thread_id, datetime, log_levels[level], filepath
);
}
@@ -112,8 +110,6 @@ void sist_log(const char *filepath, int level, char *str) {
char log_str[LOG_MAX_LENGTH];
unsigned long long pid = (unsigned long long) pthread_self();
char datetime[32];
time_t t;
struct tm result;
@@ -132,8 +128,8 @@ void sist_log(const char *filepath, int level, char *str) {
log_len = snprintf(
log_str, sizeof(log_str),
"{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
"{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
);
cJSON_Delete(log_str_json);
@@ -147,16 +143,16 @@ void sist_log(const char *filepath, int level, char *str) {
if (is_tty) {
log_len = snprintf(
log_str, sizeof(log_str),
"\033[%dm[%04llX]%s [%s] [%s %s] %s \033[0m\n",
31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
"\033[%dmT%d%s [%s] [%s %s] %s \033[0m\n",
31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
datetime, log_levels[level], filepath,
str
);
} else {
log_len = snprintf(
log_str, sizeof(log_str),
"[%04llX] [%s] [%s %s] %s \n",
pid, datetime, log_levels[level], filepath,
"T%d [%s] [%s %s] %s \n",
ProcData.thread_id, datetime, log_levels[level], filepath,
str
);
}

View File

@@ -2,6 +2,7 @@
#define SIST2_LOG_H
#include <signal.h>
#define LOG_MAX_LENGTH 8192
#define LOG_SIST_DEBUG 0
@@ -10,32 +11,37 @@
#define LOG_SIST_ERROR 3
#define LOG_SIST_FATAL 4
#define LOG_DEBUGF(filepath, fmt, ...) \
if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}
#define LOG_DEBUG(filepath, str) \
if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}
#define LOG_DEBUGF(filepath, fmt, ...) do{\
if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}}while(0)
#define LOG_DEBUG(filepath, str) do{\
if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}}while(0)
#define LOG_INFOF(filepath, fmt, ...) \
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}
#define LOG_INFO(filepath, str) \
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}
#define LOG_INFOF(filepath, fmt, ...) do {\
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}} while(0)
#define LOG_INFO(filepath, str) do {\
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}} while(0)
#define LOG_WARNINGF(filepath, fmt, ...) \
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}
#define LOG_WARNING(filepath, str) \
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}
#define LOG_WARNINGF(filepath, fmt, ...) do {\
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}}while(0)
#define LOG_WARNING(filepath, str) do{\
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}}while(0)
#define LOG_ERRORF(filepath, fmt, ...) \
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}
#define LOG_ERROR(filepath, str) \
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}
#define LOG_ERRORF(filepath, fmt, ...) do {\
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}}while(0)
#define LOG_ERROR(filepath, str) do{\
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}}while(0)
#define LOG_FATALF(filepath, fmt, ...) \
#define LOG_FATALF(filepath, fmt, ...)\
sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__);\
exit(-1);
raise(SIGUSR1)
#define LOG_FATAL(filepath, str) \
sist_log(filepath, LOG_SIST_FATAL, str);\
exit(-1);
exit(SIGUSR1)
#define LOG_FATALF_NO_EXIT(filepath, fmt, ...) \
sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__)
#define LOG_FATAL_NO_EXIT(filepath, str) \
sist_log(filepath, LOG_SIST_FATAL, str)
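// Note (not part of this diff): the do { ... } while(0) wrappers added above turn
// each macro into a single statement, so they compose safely with if/else, e.g.:
//
//     if (err)
//         LOG_ERROR("main.c", "oops");
//     else
//         recover();
//
// With the old bare `if (LogCtx.verbose) {...}` form, the semicolon after the
// macro call would terminate the outer `if`, leaving the `else` unmatched.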
#include "sist.h"

File diff suppressed because one or more lines are too long

View File

@@ -5,8 +5,6 @@
#include <locale.h>
#include "cli.h"
#include "io/serialize.h"
#include "io/store.h"
#include "tpool.h"
#include "io/walk.h"
#include "index/elastic.h"
@@ -16,13 +14,9 @@
#include "auth0/auth0_c_api.h"
#include <signal.h>
#include <unistd.h>
#include <pthread.h>
#include "stats.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
#include "src/database/database.h"
static const char *const usage[] = {
@@ -34,109 +28,62 @@ static const char *const usage[] = {
};
static __sighandler_t sigsegv_handler = NULL;
static __sighandler_t sigabrt_handler = NULL;
void database_scan_begin(scan_args_t *args) {
index_descriptor_t *desc = &ScanCtx.index.desc;
void sig_handler(int signum) {
database_t *db = database_create(args->output, INDEX_DATABASE);
LogCtx.verbose = TRUE;
LogCtx.very_verbose = TRUE;
if (args->incremental) {
// Update existing descriptor
database_open(db);
index_descriptor_t *original_desc = database_read_index_descriptor(db);
LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n");
LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum));
// copy original index id
strcpy(desc->id, original_desc->id);
if (ScanCtx.dbg_current_files != NULL) {
GHashTableIter iter;
g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files);
void *key;
void *value;
while (g_hash_table_iter_next(&iter, &key, &value)) {
parse_job_t *job = value;
if (isatty(STDERR_FILENO)) {
LOG_DEBUGF(
"*SIGNAL HANDLER*",
"Thread \033[%dm[%04llX]\033[0m was working on job '%s'",
31 + ((unsigned int) key) % 7, key, job->filepath
);
} else {
LOG_DEBUGF(
"*SIGNAL HANDLER*",
"THREAD [%04llX] was working on job %s",
key, job->filepath
);
}
if (original_desc->version_major != VersionMajor) {
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc->version, Version);
}
}
if (ScanCtx.pool != NULL) {
tpool_dump_debug_info(ScanCtx.pool);
}
strcpy(original_desc->root, desc->root);
original_desc->root_len = desc->root_len;
strcpy(original_desc->rewrite_url, desc->rewrite_url);
strcpy(original_desc->name, desc->name);
if (IndexCtx.pool != NULL) {
tpool_dump_debug_info(IndexCtx.pool);
}
time(&original_desc->timestamp);
LOG_INFO(
"*SIGNAL HANDLER*",
"Please consider creating a bug report at https://github.com/simon987/sist2/issues !"
)
LOG_INFO(
"*SIGNAL HANDLER*",
"sist2 is an open source project and relies on the collaboration of its users to diagnose and fix bugs"
)
database_write_index_descriptor(db, original_desc);
free(original_desc);
#ifndef SIST_DEBUG
LOG_WARNING(
"*SIGNAL HANDLER*",
"You are running sist2 in release mode! Please consider downloading the debug binary from the Github "
"releases page to provide additional information when submitting a bug report."
)
#endif
database_incremental_scan_begin(db);
if (signum == SIGSEGV && sigsegv_handler != NULL) {
sigsegv_handler(signum);
} else if (signum == SIGABRT && sigabrt_handler != NULL) {
sigabrt_handler(signum);
}
exit(-1);
}
void init_dir(const char *dirpath, scan_args_t *args) {
char path[PATH_MAX];
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
time(&ScanCtx.index.desc.timestamp);
strcpy(ScanCtx.index.desc.version, Version);
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
if (args->incremental != NULL) {
// copy old index id
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
} else {
// Create new descriptor
time(&desc->timestamp);
strcpy(desc->version, Version);
desc->version_major = VersionMajor;
desc->version_minor = VersionMinor;
desc->version_patch = VersionPatch;
// generate new index id based on timestamp
unsigned char index_md5[MD5_DIGEST_LENGTH];
MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
database_initialize(db);
database_open(db);
database_write_index_descriptor(db, desc);
}
write_index_descriptor(path, &ScanCtx.index.desc);
database_close(db, FALSE);
}
void scan_print_header() {
LOG_INFOF("main.c", "sist2 v%s", Version)
void write_thumbnail_callback(char *key, int num, void *buf, size_t buf_len) {
database_write_thumbnail(ProcData.index_db, key, num, buf, buf_len);
}
void _store(char *key, size_t key_len, char *buf, size_t buf_len) {
store_write(ScanCtx.index.store, key, key_len, buf, buf_len);
}
void _log(const char *filepath, int level, char *str) {
void log_callback(const char *filepath, int level, char *str) {
if (level == LEVEL_FATAL) {
sist_log(filepath, level, str);
exit(-1);
@@ -153,7 +100,7 @@ void _log(const char *filepath, int level, char *str) {
}
}
void _logf(const char *filepath, int level, char *format, ...) {
void logf_callback(const char *filepath, int level, char *format, ...) {
va_list args;
@@ -177,17 +124,12 @@ void _logf(const char *filepath, int level, char *format, ...) {
void initialize_scan_context(scan_args_t *args) {
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
ScanCtx.calculate_checksums = args->calculate_checksums;
// Archive
ScanCtx.arc_ctx.mode = args->archive_mode;
ScanCtx.arc_ctx.log = _log;
ScanCtx.arc_ctx.logf = _logf;
ScanCtx.arc_ctx.log = log_callback;
ScanCtx.arc_ctx.logf = logf_callback;
ScanCtx.arc_ctx.parse = (parse_callback_t) parse;
if (args->archive_passphrase != NULL) {
strcpy(ScanCtx.arc_ctx.passphrase, args->archive_passphrase);
@@ -196,17 +138,16 @@ void initialize_scan_context(scan_args_t *args) {
}
// Comic
ScanCtx.comic_ctx.log = _log;
ScanCtx.comic_ctx.logf = _logf;
ScanCtx.comic_ctx.store = _store;
ScanCtx.comic_ctx.log = log_callback;
ScanCtx.comic_ctx.logf = logf_callback;
ScanCtx.comic_ctx.store = write_thumbnail_callback;
ScanCtx.comic_ctx.enable_tn = args->tn_count > 0;
ScanCtx.comic_ctx.tn_size = args->tn_size;
ScanCtx.comic_ctx.tn_qscale = args->tn_quality;
ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbz");
ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string("application/x-cbr");
ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string("application/x-cbz");
// Ebook
pthread_mutex_init(&ScanCtx.ebook_ctx.mupdf_mutex, NULL);
ScanCtx.ebook_ctx.content_size = args->content_size;
ScanCtx.ebook_ctx.enable_tn = args->tn_count > 0;
ScanCtx.ebook_ctx.tn_size = args->tn_size;
@@ -214,25 +155,25 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.ebook_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.ebook_ctx.tesseract_path = args->tesseract_path;
}
ScanCtx.ebook_ctx.log = _log;
ScanCtx.ebook_ctx.logf = _logf;
ScanCtx.ebook_ctx.store = _store;
ScanCtx.ebook_ctx.log = log_callback;
ScanCtx.ebook_ctx.logf = logf_callback;
ScanCtx.ebook_ctx.store = write_thumbnail_callback;
ScanCtx.ebook_ctx.fast_epub_parse = args->fast_epub;
ScanCtx.ebook_ctx.tn_qscale = args->tn_quality;
// Font
ScanCtx.font_ctx.enable_tn = args->tn_count > 0;
ScanCtx.font_ctx.log = _log;
ScanCtx.font_ctx.logf = _logf;
ScanCtx.font_ctx.store = _store;
ScanCtx.font_ctx.log = log_callback;
ScanCtx.font_ctx.logf = logf_callback;
ScanCtx.font_ctx.store = write_thumbnail_callback;
// Media
ScanCtx.media_ctx.tn_qscale = args->tn_quality;
ScanCtx.media_ctx.tn_size = args->tn_size;
ScanCtx.media_ctx.tn_count = args->tn_count;
ScanCtx.media_ctx.log = _log;
ScanCtx.media_ctx.logf = _logf;
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.log = log_callback;
ScanCtx.media_ctx.logf = logf_callback;
ScanCtx.media_ctx.store = write_thumbnail_callback;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer_mib * 1024 * 1024;
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
ScanCtx.media_ctx.read_subtitles = args->tn_count;
@@ -246,32 +187,33 @@ void initialize_scan_context(scan_args_t *args) {
// OOXML
ScanCtx.ooxml_ctx.enable_tn = args->tn_count > 0;
ScanCtx.ooxml_ctx.content_size = args->content_size;
ScanCtx.ooxml_ctx.log = _log;
ScanCtx.ooxml_ctx.logf = _logf;
ScanCtx.ooxml_ctx.store = _store;
ScanCtx.ooxml_ctx.log = log_callback;
ScanCtx.ooxml_ctx.logf = logf_callback;
ScanCtx.ooxml_ctx.store = write_thumbnail_callback;
// MOBI
ScanCtx.mobi_ctx.content_size = args->content_size;
ScanCtx.mobi_ctx.log = _log;
ScanCtx.mobi_ctx.logf = _logf;
ScanCtx.mobi_ctx.log = log_callback;
ScanCtx.mobi_ctx.logf = logf_callback;
ScanCtx.mobi_ctx.store = write_thumbnail_callback;
ScanCtx.mobi_ctx.enable_tn = args->tn_count > 0;
ScanCtx.mobi_ctx.tn_size = args->tn_size;
ScanCtx.mobi_ctx.tn_qscale = args->tn_quality;
// TEXT
ScanCtx.text_ctx.content_size = args->content_size;
ScanCtx.text_ctx.log = _log;
ScanCtx.text_ctx.logf = _logf;
ScanCtx.text_ctx.log = log_callback;
ScanCtx.text_ctx.logf = logf_callback;
// MSDOC
ScanCtx.msdoc_ctx.enable_tn = args->tn_count > 0;
ScanCtx.msdoc_ctx.tn_size = args->tn_size;
ScanCtx.msdoc_ctx.content_size = args->content_size;
ScanCtx.msdoc_ctx.log = _log;
ScanCtx.msdoc_ctx.logf = _logf;
ScanCtx.msdoc_ctx.store = _store;
ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/msword");
ScanCtx.msdoc_ctx.log = log_callback;
ScanCtx.msdoc_ctx.logf = logf_callback;
ScanCtx.msdoc_ctx.store = write_thumbnail_callback;
ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string("application/msword");
ScanCtx.threads = args->threads;
ScanCtx.depth = args->depth;
ScanCtx.mem_limit = (size_t) args->scan_mem_limit_mib * 1024 * 1024;
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
@@ -284,176 +226,66 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.raw_ctx.tn_qscale = args->tn_quality;
ScanCtx.raw_ctx.enable_tn = args->tn_count > 0;
ScanCtx.raw_ctx.tn_size = args->tn_size;
ScanCtx.raw_ctx.log = _log;
ScanCtx.raw_ctx.logf = _logf;
ScanCtx.raw_ctx.store = _store;
ScanCtx.raw_ctx.log = log_callback;
ScanCtx.raw_ctx.logf = logf_callback;
ScanCtx.raw_ctx.store = write_thumbnail_callback;
// Wpd
ScanCtx.wpd_ctx.content_size = args->content_size;
ScanCtx.wpd_ctx.log = _log;
ScanCtx.wpd_ctx.logf = _logf;
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
ScanCtx.wpd_ctx.log = log_callback;
ScanCtx.wpd_ctx.logf = logf_callback;
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string("application/wordperfect");
// Json
ScanCtx.json_ctx.content_size = args->content_size;
ScanCtx.json_ctx.log = _log;
ScanCtx.json_ctx.logf = _logf;
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/json");
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/ndjson");
ScanCtx.json_ctx.log = log_callback;
ScanCtx.json_ctx.logf = logf_callback;
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string("application/json");
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string("application/ndjson");
}
/**
* Loads an existing index as the baseline for incremental scanning.
* 1. load old index files (original+main) => original_table
* 2. allocate empty table => copy_table
* 3. allocate empty table => new_table
* the original_table/copy_table/new_table will be populated in parsing/parse.c:parse
* and consumed in main.c:save_incremental_index
*
* Note: the existing index may or may not itself be an incremental index.
*/
void load_incremental_index(const scan_args_t *args) {
char file_path[PATH_MAX];
ScanCtx.original_table = incremental_get_table();
ScanCtx.copy_table = incremental_get_table();
ScanCtx.new_table = incremental_get_table();
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
if (strcmp(original_desc.version, Version) != 0) {
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc.version, Version)
}
READ_INDICES(
file_path,
args->incremental,
incremental_read(ScanCtx.original_table, file_path, &original_desc),
LOG_DEBUG("main.c", "The base index for incremental scan does not have a main index"),
TRUE
);
LOG_INFOF("main.c", "Loaded %d items into the mtime table.", g_hash_table_size(ScanCtx.original_table))
}
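// Summary (not part of this diff) of the three tables' lifecycle, per the
// comments above: original_table holds every doc of the old index (filled here);
// parse.c moves unchanged docs into copy_table and new/modified docs into
// new_table; save_incremental_index() then re-emits copy_table docs and writes
// a delete list for docs that ended up in neither table (see
// incremental_delete_handle_doc in serialize.c).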
/**
* Saves an incremental index.
* Before calling this function, the scanner should have finished writing the main index.
* 1. Build original_table - new_table => delete_table
* 2. Incrementally copy from old index files [(original+main) /\ copy_table] => index_original.ndjson.zst & store
*/
void save_incremental_index(scan_args_t *args) {
char dst_path[PATH_MAX];
char store_path[PATH_MAX];
char file_path[PATH_MAX];
char del_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental);
snprintf(dst_path, PATH_MAX, "%s_index_original.ndjson.zst", ScanCtx.index.path);
store_t *source = store_create(store_path, STORE_SIZE_TN);
LOG_INFOF("main.c", "incremental_delete: original size = %u, copy size = %u, new size = %u",
g_hash_table_size(ScanCtx.original_table),
g_hash_table_size(ScanCtx.copy_table),
g_hash_table_size(ScanCtx.new_table));
snprintf(del_path, PATH_MAX, "%s_index_delete.list.zst", ScanCtx.index.path);
READ_INDICES(file_path, args->incremental,
incremental_delete(del_path, file_path, ScanCtx.copy_table, ScanCtx.new_table),
perror("incremental_delete"), 1);
writer_cleanup();
READ_INDICES(file_path, args->incremental,
incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table),
perror("incremental_copy"), 1);
writer_cleanup();
store_destroy(source);
snprintf(store_path, PATH_MAX, "%stags", args->incremental);
snprintf(dst_path, PATH_MAX, "%stags", ScanCtx.index.path);
store_t *source_tags = store_create(store_path, STORE_SIZE_TAG);
store_copy(source_tags, dst_path);
store_destroy(source_tags);
}
/**
* An index can be either incremental or non-incremental (initial index).
* For an initial index, there is only the "main" index.
* For an incremental index, there are, additionally:
* - An "original" index, referencing all files unchanged since the previous index.
* - A "delete" index, referencing all files that existed in the previous index but have been deleted since.
* Therefore, for an incremental index, "main"+"original" covers all the current files in the live filesystem
* and is orthogonal to the "delete" index. When building an incremental index on top of an old incremental
* index, the old "delete" index can be safely ignored.
*/
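// Illustration (not part of this diff): after an incremental scan,
//     files currently on disk = "main" (new or modified) + "original" (unchanged)
//     files that disappeared  = "delete" list
// The two sets are disjoint, which is why an old "delete" list can be ignored
// when chaining incremental scans.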
void sist2_scan(scan_args_t *args) {
ScanCtx.mime_table = mime_get_mime_table();
ScanCtx.ext_table = mime_get_ext_table();
initialize_scan_context(args);
init_dir(ScanCtx.index.path, args);
database_scan_begin(args);
char store_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
ScanCtx.index.store = store_create(store_path, STORE_SIZE_TN);
LOG_INFOF("main.c", "sist2 v%s", Version);
snprintf(store_path, PATH_MAX, "%smeta", ScanCtx.index.path);
ScanCtx.index.meta_store = store_create(store_path, STORE_SIZE_META);
scan_print_header();
if (args->incremental != NULL) {
load_incremental_index(args);
}
ScanCtx.pool = tpool_create(ScanCtx.threads, thread_cleanup, TRUE, TRUE, ScanCtx.mem_limit);
ScanCtx.pool = tpool_create(ScanCtx.threads, TRUE);
tpool_start(ScanCtx.pool);
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE, 0);
tpool_start(ScanCtx.writer_pool);
if (args->list_path) {
// Scan using file list
int list_ret = iterate_file_list(args->list_file);
if (list_ret != 0) {
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret);
}
} else {
// Scan directory recursively
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno);
}
}
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
tpool_wait(ScanCtx.writer_pool);
tpool_destroy(ScanCtx.writer_pool);
LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size);
LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size);
LOG_DEBUGF("main.c", "Skipped files: %d", ScanCtx.dbg_skipped_files_count)
LOG_DEBUGF("main.c", "Excluded files: %d", ScanCtx.dbg_excluded_files_count)
LOG_DEBUGF("main.c", "Failed files: %d", ScanCtx.dbg_failed_files_count)
LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size)
LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size)
database_t *db = database_create(args->output, INDEX_DATABASE);
database_open(db);
if (args->incremental != NULL) {
save_incremental_index(args);
if (args->incremental != FALSE) {
database_incremental_scan_end(db);
}
generate_stats(&ScanCtx.index, args->treemap_threshold, ScanCtx.index.path);
store_destroy(ScanCtx.index.store);
store_destroy(ScanCtx.index.meta_store);
database_generate_stats(db, args->treemap_threshold);
database_close(db, args->optimize_database);
}
void sist2_index(index_args_t *args) {
char file_path[PATH_MAX];
IndexCtx.es_url = args->es_url;
IndexCtx.es_index = args->es_index;
IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
@@ -464,91 +296,78 @@ void sist2_index(index_args_t *args) {
elastic_init(args->force_reset, args->es_mappings, args->es_settings);
}
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->index_path);
database_t *db = database_create(args->index_path, INDEX_DATABASE);
database_open(db);
index_descriptor_t *desc = database_read_index_descriptor(db);
database_close(db, FALSE);
index_descriptor_t desc = read_index_descriptor(descriptor_path);
LOG_DEBUGF("main.c", "Index version %s", desc->version);
LOG_DEBUGF("main.c", "descriptor version %s (%s)", desc.version, desc.type)
if (strcmp(desc.version, Version) != 0) {
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", desc.version, Version)
if (desc->version_major != VersionMajor) {
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", desc->version, Version);
}
DIR *dir = opendir(args->index_path);
if (dir == NULL) {
LOG_FATALF("main.c", "Could not open index %s: %s", args->index_path, strerror(errno))
}
char path_tmp[PATH_MAX];
snprintf(path_tmp, sizeof(path_tmp), "%stags", args->index_path);
IndexCtx.tag_store = store_create(path_tmp, STORE_SIZE_TAG);
IndexCtx.tags = store_read_all(IndexCtx.tag_store);
snprintf(path_tmp, sizeof(path_tmp), "%smeta", args->index_path);
IndexCtx.meta_store = store_create(path_tmp, STORE_SIZE_META);
IndexCtx.meta = store_read_all(IndexCtx.meta_store);
index_func f;
if (args->print) {
f = print_json;
} else {
f = index_json;
}
IndexCtx.pool = tpool_create(args->threads, elastic_cleanup, FALSE, args->print == 0, 0);
IndexCtx.pool = tpool_create(args->threads, args->print == FALSE);
tpool_start(IndexCtx.pool);
READ_INDICES(file_path, args->index_path, {
read_index(file_path, desc.id, desc.type, f);
LOG_DEBUGF("main.c", "Read index file %s (%s)", file_path, desc.type);
}, {}, !args->incremental);
int cnt = 0;
// Only read the _delete index if we're sending data to ES
if (!args->print) {
snprintf(file_path, PATH_MAX, "%s_index_delete.list.zst", args->index_path);
if (0 == access(file_path, R_OK)) {
read_lines(file_path, (line_processor_t) {
.data = NULL,
.func = delete_document
});
LOG_DEBUGF("main.c", "Read index file %s (%s)", file_path, desc.type)
db = database_create(args->index_path, INDEX_DATABASE);
database_open(db);
database_iterator_t *iterator = database_create_document_iterator(db);
database_document_iter_foreach(json, iterator) {
char doc_id[SIST_DOC_ID_LEN];
strcpy(doc_id, cJSON_GetObjectItem(json, "_id")->valuestring);
cJSON_DeleteItemFromObject(json, "_id");
if (args->print) {
print_json(json, doc_id);
} else {
index_json(json, doc_id);
cnt += 1;
}
cJSON_Delete(json);
}
closedir(dir);
free(iterator);
if (!args->print) {
database_iterator_t *del_iter = database_create_delete_list_iterator(db);
database_delete_list_iter_foreach(id, del_iter) {
delete_document(id);
free(id);
}
free(del_iter);
}
database_close(db, FALSE);
tpool_wait(IndexCtx.pool);
tpool_destroy(IndexCtx.pool);
if (IndexCtx.needs_es_connection) {
finish_indexer(args->script, args->async_script, desc.id);
finish_indexer(args->script, args->async_script, desc->id);
}
store_destroy(IndexCtx.tag_store);
store_destroy(IndexCtx.meta_store);
g_hash_table_remove_all(IndexCtx.tags);
g_hash_table_destroy(IndexCtx.tags);
free(desc);
}
void sist2_exec_script(exec_args_t *args) {
LogCtx.verbose = TRUE;
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->index_path);
index_descriptor_t desc = read_index_descriptor(descriptor_path);
IndexCtx.es_url = args->es_url;
IndexCtx.es_index = args->es_index;
IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
IndexCtx.needs_es_connection = TRUE;
LOG_DEBUGF("main.c", "descriptor version %s (%s)", desc.version, desc.type)
database_t *db = database_create(args->index_path, INDEX_DATABASE);
database_open(db);
execute_update_script(args->script, args->async_script, desc.id);
index_descriptor_t *desc = database_read_index_descriptor(db);
LOG_DEBUGF("main.c", "Index version %s", desc->version);
execute_update_script(args->script, args->async_script, desc->id);
free(args->script);
database_close(db, FALSE);
}
void sist2_web(web_args_t *args) {
@@ -572,23 +391,17 @@ void sist2_web(web_args_t *args) {
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
if (abs_path == NULL) {
return;
}
char path_tmp[PATH_MAX];
snprintf(path_tmp, PATH_MAX, "%sthumbs", abs_path);
WebCtx.indices[i].store = store_create(path_tmp, STORE_SIZE_TN);
snprintf(path_tmp, PATH_MAX, "%stags", abs_path);
mkdir(path_tmp, S_IWUSR | S_IRUSR | S_IXUSR);
WebCtx.indices[i].tag_store = store_create(path_tmp, STORE_SIZE_TAG);
snprintf(path_tmp, PATH_MAX, "%sdescriptor.json", abs_path);
WebCtx.indices[i].desc = read_index_descriptor(path_tmp);
strcpy(WebCtx.indices[i].path, abs_path);
LOG_INFOF("main.c", "Loaded index: [%s]", WebCtx.indices[i].desc.name)
WebCtx.indices[i].db = database_create(abs_path, INDEX_DATABASE);
database_open(WebCtx.indices[i].db);
index_descriptor_t *desc = database_read_index_descriptor(WebCtx.indices[i].db);
WebCtx.indices[i].desc = *desc;
free(desc);
LOG_INFOF("main.c", "Loaded index: [%s]", WebCtx.indices[i].desc.name);
free(abs_path);
}
@@ -603,7 +416,7 @@ void sist2_web(web_args_t *args) {
* Negative number -> Raise error
* Specified a valid number -> Continue as normal
*/
int set_to_negative_if_value_is_zero(struct argparse *self, const struct argparse_option *option) {
int set_to_negative_if_value_is_zero(UNUSED(struct argparse *self), const struct argparse_option *option) {
int specified_value = *(int *) option->value;
if (specified_value == 0) {
@@ -616,11 +429,7 @@ int set_to_negative_if_value_is_zero(struct argparse *self, const struct argpars
}
}
int main(int argc, const char *argv[]) {
sigsegv_handler = signal(SIGSEGV, sig_handler);
sigabrt_handler = signal(SIGABRT, sig_handler);
setlocale(LC_ALL, "");
scan_args_t *scan_args = scan_args_create();
@@ -640,38 +449,37 @@ int main(int argc, const char *argv[]) {
struct argparse_option options[] = {
OPT_HELP(),
OPT_BOOLEAN('v', "version", &arg_version, "Show version and exit"),
OPT_BOOLEAN(0, "verbose", &LogCtx.verbose, "Turn on logging"),
OPT_BOOLEAN(0, "very-verbose", &LogCtx.very_verbose, "Turn on debug messages"),
OPT_BOOLEAN('v', "version", &arg_version, "Print version and exit."),
OPT_BOOLEAN(0, "verbose", &LogCtx.verbose, "Turn on logging."),
OPT_BOOLEAN(0, "very-verbose", &LogCtx.very_verbose, "Turn on debug messages."),
OPT_BOOLEAN(0, "json-logs", &LogCtx.json_logs, "Output logs in JSON format."),
OPT_GROUP("Scan options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
OPT_INTEGER(0, "mem-throttle", &scan_args->scan_mem_limit_mib,
"Total memory threshold in MiB for scan throttling. DEFAULT=0",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->scan_mem_limit_mib),
OPT_FLOAT('q', "thumbnail-quality", &scan_args->tn_quality,
"Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=1",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT: 1"),
OPT_INTEGER('q', "thumbnail-quality", &scan_args->tn_quality,
"Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality),
OPT_INTEGER(0, "thumbnail-size", &scan_args->tn_size,
"Thumbnail size, in pixels. DEFAULT=500",
"Thumbnail size, in pixels. DEFAULT: 552",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_size),
OPT_INTEGER(0, "thumbnail-count", &scan_args->tn_count,
"Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1",
"Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT: 1",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_count),
OPT_INTEGER(0, "content-size", &scan_args->content_size,
"Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768",
"Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT: 32768",
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->content_size),
OPT_STRING(0, "incremental", &scan_args->incremental,
"Reuse an existing index and only scan modified files."),
OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"),
OPT_STRING('o', "output", &scan_args->output, "Output index file path. DEFAULT: index.sist2"),
OPT_BOOLEAN(0, "incremental", &scan_args->incremental,
"If the output file path exists, only scan new or modified files."),
OPT_BOOLEAN(0, "optimize-index", &scan_args->optimize_database,
"Defragment index file after scan to reduce its file size."),
OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."),
OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: (name of the directory)"),
OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: index"),
OPT_INTEGER(0, "depth", &scan_args->depth, "Scan up to DEPTH subdirectories deep. "
"Use 0 to only scan files in PATH. DEFAULT: -1"),
OPT_STRING(0, "archive", &scan_args->archive, "Archive file mode (skip|list|shallow|recurse). "
"skip: Don't parse, list: only get file names as text, "
"shallow: Don't parse archives inside archives. DEFAULT: recurse"),
"skip: don't scan, list: only save file names as text, "
"shallow: don't scan archives inside archives. DEFAULT: recurse"),
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
"Passphrase for encrypted archive files"),
@@ -680,8 +488,8 @@ int main(int argc, const char *argv[]) {
"which are installed on your machine)"),
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned."),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type."),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
"(see USAGE.md). DEFAULT: 0.0005"),
OPT_INTEGER(0, "mem-buffer", &scan_args->max_memory_buffer_mib,
@@ -689,47 +497,52 @@ int main(int argc, const char *argv[]) {
"(see USAGE.md). DEFAULT: 2000"),
OPT_BOOLEAN(0, "read-subtitles", &scan_args->read_subtitles, "Read subtitles from media files."),
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
"Faster but less accurate EPUB parsing (no thumbnails, metadata)."),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
" instead of normal directory traversal. Use '-' to read"
" from stdin."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT: 1"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT: http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
"Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT: sist2"),
OPT_BOOLEAN('p', "print", &index_args->print,
"Print JSON documents to stdout instead of indexing to elasticsearch."),
OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
"Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch."),
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 100"),
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. "
"(You must use this option the first time you use the index command)"),
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 70"),
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings."),
OPT_GROUP("Web options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
OPT_STRING(0, "bind", &web_args->listen_address, "Listen on this address. DEFAULT=localhost:4090"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT: http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
"Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT: sist2"),
OPT_STRING(0, "bind", &web_args->listen_address,
"Listen for connections on this address. DEFAULT: localhost:4090"),
OPT_STRING(0, "auth", &web_args->credentials, "Basic auth in user:password format"),
OPT_STRING(0, "auth0-audience", &web_args->auth0_audience, "API audience/identifier"),
OPT_STRING(0, "auth0-domain", &web_args->auth0_domain, "Application domain"),
OPT_STRING(0, "auth0-client-id", &web_args->auth0_client_id, "Application client ID"),
OPT_STRING(0, "auth0-public-key-file", &web_args->auth0_public_key_path, "Path to Auth0 public key file extracted from <domain>/pem"),
OPT_STRING(0, "auth0-public-key-file", &web_args->auth0_public_key_path,
"Path to Auth0 public key file extracted from <domain>/pem"),
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
OPT_STRING(0, "lang", &web_args->lang, "Default UI language. Can be changed by the user"),
OPT_GROUP("Exec-script options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT: http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
"Do not verify SSL connections to Elasticsearch."),
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT: sist2"),
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
@@ -738,7 +551,11 @@ int main(int argc, const char *argv[]) {
struct argparse argparse;
argparse_init(&argparse, options, usage, 0);
argparse_describe(&argparse, DESCRIPTION, EPILOG);
argparse_describe(
&argparse,
"\nLightning-fast file system indexer and search tool.",
"\nMade by simon987 <me@simon987.net>. Released under GPL-3.0"
);
argc = argparse_parse(&argparse, argc, argv);
if (arg_version) {
@@ -806,7 +623,7 @@ int main(int argc, const char *argv[]) {
} else {
argparse_usage(&argparse);
LOG_FATALF("main.c", "Invalid command: '%s'\n", argv[0])
LOG_FATALF("main.c", "Invalid command: '%s'\n", argv[0]);
}
printf("\n");

src/parsing/fs_util.h Normal file
View File

@@ -0,0 +1,41 @@
#ifndef SIST2_FS_UTIL_H
#define SIST2_FS_UTIL_H
#include "src/sist.h"
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
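// fs_read() below opens the file lazily on the first call and, when
// calculate_checksum is set, streams every chunk through SHA1 so the digest is
// ready by the time fs_close() finalizes it.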
static int fs_read(struct vfile *f, void *buf, size_t size) {
if (f->fd == -1) {
SHA1_Init(&f->sha1_ctx);
f->fd = open(f->filepath, O_RDONLY);
if (f->fd == -1) {
return -1;
}
}
int ret = (int) read(f->fd, buf, size);
if (ret != 0 && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
}
return ret;
}
static void fs_close(struct vfile *f) {
if (f->fd != -1) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
close(f->fd);
}
}
static void fs_reset(struct vfile *f) {
if (f->fd != -1) {
lseek(f->fd, 0, SEEK_SET);
}
}
#endif

src/parsing/magic_util.c Normal file
View File

@@ -0,0 +1,32 @@
#include "magic_util.h"
#include "src/log.h"
#include "mime.h"
#include <magic.h>
#include "src/magic_generated.c"
char *magic_buffer_embedded(void *buffer, size_t buffer_size) {
magic_t magic = magic_open(MAGIC_MIME_TYPE);
const char *magic_buffers[1] = {magic_database_buffer,};
size_t sizes[1] = {sizeof(magic_database_buffer),};
// TODO optimisation: check if we can reuse the magic instance
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
if (load_ret != 0) {
LOG_FATALF("magic_util.c", "Could not load libmagic database: (%d)", load_ret);
}
const char *magic_mime_str = magic_buffer(magic, buffer, buffer_size);
char *return_value = NULL;
if (magic_mime_str != NULL) {
return_value = malloc(strlen(magic_mime_str) + 1);
strcpy(return_value, magic_mime_str);
}
magic_close(magic);
return return_value;
}
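// Hedged usage sketch (not part of this diff): the caller owns the returned
// string and must free it. `buf` and `buf_len` are hypothetical.
//
//     char *mime = magic_buffer_embedded(buf, buf_len);
//     if (mime != NULL) {
//         printf("mime=%s\n", mime);
//         free(mime);
//     }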

src/parsing/magic_util.h Normal file
View File

@@ -0,0 +1,8 @@
#ifndef SIST2_MAGIC_UTIL_H
#define SIST2_MAGIC_UTIL_H
#include <stdio.h>
char *magic_buffer_embedded(void *buffer, size_t buffer_size);
#endif //SIST2_MAGIC_UTIL_H

Some files were not shown because too many files have changed in this diff