Compare commits


48 Commits

Author SHA1 Message Date
4dedd281f1 Push compiled vue changes 2022-01-09 09:30:31 -05:00
65c499e477 Merge pull request #231 from simon987/dev (v2.11.6) 2022-01-09 09:28:24 -05:00
625f3d0d6e Option to update media type tab in real time, add media type table in details 2022-01-08 18:23:22 -05:00
64b8aab8bf Validate that all the tesseract data files are in the same folder 2022-01-08 15:04:07 -05:00
ad95684771 Update --ocr-* args, enable OCR'ing images 2022-01-08 14:24:50 -05:00
b37e5a4ad4 Fix some warnings in media.c 2022-01-08 11:06:14 -05:00
15ae2190cf Fix tesseract lang validation, update README.md, fix tesseract memory leak 2022-01-08 11:04:52 -05:00
255bc2d689 Tweak MIN_OCR_SIZE behavior, update gitignore 2022-01-08 10:33:02 -05:00
fe1aa6dd4c Merge pull request #227 from yatli/dev (refactor: split ocr_extract_text from ebook) 2022-01-08 10:25:41 -05:00
cd2a44e016 Update ocr.h (Fix minimum image size validation in ocr_extract_text) 2022-01-08 10:24:57 -05:00
ed2a3f342a Localize tag add/delete, fix some translations, add LanguageIcon, add --lang arg, fix lightbox slideshow time, fix gif hover 2022-01-08 10:03:38 -05:00
1107fe9a53 Remove libscan hash debug info 2022-01-08 10:00:34 -05:00
a96e65d039 Add zh-CN option in language dropdown 2022-01-07 17:44:49 -05:00
87936eecd4 Merge pull request #229 from yatli/master (add zh-CN translation) 2022-01-07 13:55:14 -05:00
Yatao Li d817a0e9dd add zh-CN translation 2022-01-08 01:39:50 +08:00
Yatao Li 94a5e0ac59 refactor: split ocr_extract_text from ebook 2022-01-07 23:20:35 +08:00
d40f5052f9 static link for libasan in debug build 2021-12-29 19:25:03 -05:00
ee9a8fa514 Add thread lock for incremental_mark_file_for_copy() 2021-12-29 19:18:10 -05:00
81008d8936 Add --list-file argument 2021-12-29 18:54:13 -05:00
52466d5d8a Update tesseract datapaths 2021-12-25 11:12:00 -05:00
5f73fc024b Version bump, update readme 2021-12-25 11:08:52 -05:00
f2fd7ccf41 Fix raw parsing maybe, fix index picker css 2021-12-25 11:08:52 -05:00
d87fee8e00 Merge pull request #214 from dpieski/patch-2 (Update USAGE.md) 2021-12-22 09:55:24 -05:00
Andrew 672d1344d7 Update USAGE.md (Get-WmiObject is deprecated in favor of Get-CimInstance) 2021-12-15 15:00:36 -06:00
27e32db1ed Fix attempt for excludes 2021-11-17 20:18:48 -05:00
bb91139ffb console log fixes, version bump 2021-11-15 20:52:24 -05:00
70cfa8c37c Fix Dockerfile.arm64 2021-11-13 18:25:24 -05:00
7493dedc8c Merge pull request #208 from simon987/dev (v2.11.4) 2021-11-13 17:37:47 -05:00
c786a31bb2 Merge remote-tracking branch 'origin/master' into dev (# Conflicts: README.md) 2021-11-13 17:36:55 -05:00
48d024e751 Update dockerfiles 2021-11-13 17:36:30 -05:00
08b2ca9d43 Update lcms -> lcms2 2021-11-12 11:29:50 -05:00
ed8b4f4fad Add natural sorting support 2021-11-12 10:33:51 -05:00
66de93a8bd Language & formatting 2021-11-12 10:17:32 -05:00
e3f78fb693 Shift click & select all/none in index picker 2021-11-12 10:12:25 -05:00
030643cee0 Move CI scripts to script folder 2021-11-12 09:05:37 -05:00
b17b9439df Print progress bar in index module 2021-11-07 13:20:05 -05:00
414f65346c Update docker command in README.md 2021-11-07 13:18:32 -05:00
be8eedc9c7 Skip subtree of excluded directories 2021-11-07 11:56:09 -05:00
5b62fe77f2 Update demo URL 2021-11-07 09:52:28 -05:00
61ab68ce15 Update argparse repo URL 2021-11-07 09:42:17 -05:00
82ecb8bb85 Update gitignore 2021-11-07 09:36:39 -05:00
a41b5dcc1f Remove libscan git submodule 2021-11-07 09:30:14 -05:00
06f21d5f0f Remove libscan submodule 2021-11-07 09:17:02 -05:00
e82a388d1e Don't show resolution badge on narrow images 2021-10-22 10:21:35 -04:00
bf02e571b3 Forgot to add that file two commits ago 2021-10-22 09:44:56 -04:00
750a392a61 Show reduced ResuldCard when there are no results 2021-10-22 09:32:17 -04:00
3d7b977a82 Read ES version, handle legacy versions, add notice & debug info 2021-10-21 19:14:43 -04:00
cd71551a22 Some documentation updates 2021-09-25 09:30:53 -04:00
104 changed files with 6787 additions and 241 deletions

View File

@@ -10,22 +10,7 @@ steps:
- name: build
image: simon987/sist2-build
commands:
- ./ci/build.sh
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
- ./scripts/build.sh
- name: scp files
image: appleboy/drone-scp
settings:
@@ -42,6 +27,21 @@ steps:
- ./VERSION
- ./sist2-x64-linux
- ./sist2-x64-linux-debug
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
---
kind: pipeline
@@ -55,7 +55,7 @@ steps:
- name: build
image: simon987/sist2-build-arm64
commands:
- ./ci/build_arm64.sh
- ./scripts/build_arm64.sh
- name: scp files
image: appleboy/drone-scp
settings:

.gitignore vendored (8 changes)
View File

@@ -10,17 +10,19 @@ Makefile
LOG
sist2*
!sist2-vue/
index.sist2/
*.sist2/
bundle*.css
bundle.js
*.a
vgcore.*
build/
third-party/
third-party/argparse
*.idx/
VERSION
git_hash.h
Testing/
test_i
test_i_inc
node_modules/
node_modules/
.cmake/
i_inc/

.gitmodules vendored (11 changes)
View File

@@ -1,6 +1,9 @@
[submodule "third-party/libscan"]
path = third-party/libscan
url = https://github.com/simon987/libscan
[submodule "third-party/argparse"]
path = third-party/argparse
url = https://github.com/cofyc/argparse
url = https://github.com/simon987/argparse
[submodule "third-party/libscan/third-party/utf8.h"]
path = third-party/libscan/third-party/utf8.h
url = https://github.com/sheredom/utf8.h
[submodule "third-party/libscan/third-party/antiword"]
path = third-party/libscan/third-party/antiword
url = https://github.com/simon987/antiword

View File

@@ -22,9 +22,6 @@ add_subdirectory(third-party/argparse)
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
@@ -41,7 +38,11 @@ add_executable(sist2
src/log.c src/log.h
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h)
src/parsing/sidecar.c src/parsing/sidecar.h
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
@@ -86,6 +87,7 @@ if (SIST_DEBUG)
sist2
PRIVATE
-fsanitize=address
-static-libasan
)
set_target_properties(
sist2

View File

@@ -6,12 +6,10 @@ COPY . .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2
RUN ls -lh
RUN ls -lh sist2-vue/dist/
FROM ubuntu:20.10
FROM ubuntu:21.10
RUN apt update && apt install -y curl libasan5
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -22,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -7,9 +7,9 @@ RUN cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE
RUN make -j$(nproc)
RUN strip sist2
FROM ubuntu:20.10
FROM --platform="linux/arm64/v8" ubuntu:21.10
RUN apt update && apt install -y curl libasan5
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -20,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -2,7 +2,7 @@
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/?i=Demo%20files)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
# sist2
@@ -10,7 +10,7 @@ sist2 (Simple incremental search tool)
*Warning: sist2 is in early development*
![sist2.png](docs/sist2.png)
![search panel](docs/sist2.png)
## Features
@@ -33,12 +33,11 @@ sist2 (Simple incremental search tool)
## Getting Started
1. Have an Elasticsearch (>= 6.X.X) instance running
1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
```bash
docker run -d --name es1 --net sist2_net -p 9200:9200 \
-e "discovery.type=single-node" elasticsearch:7.14.0
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.0
```
1. *(or)* Run using docker-compose:
```yaml
@@ -52,7 +51,7 @@ sist2 (Simple incremental search tool)
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.2-x64-linux`
1. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
1. See [Usage guide](docs/USAGE.md)
@@ -68,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
## Format support
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - |
html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* |
json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - |
| File type | Library | Content | Thumbnail | Metadata |
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
| `audio/*` | ffmpeg | - | yes | ID3 tags |
| `video/*` | ffmpeg | - | yes | title, comment, artist |
| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
| mobi, azw, azw3 | libmobi | yes | no | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)*
@@ -103,18 +102,24 @@ scan is also supported.
### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with the
`--ocr-lang <lang>` option in combination with `--ocr-images` and/or `--ocr-ebooks`.
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
Examples
You can use the `+` separator to specify multiple languages. The language
name must be identical to the `*.traineddata` file installed on your system
(use `chi_sim` rather than `chi-sim`).
Examples:
```bash
sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/
sist2 scan --ocr-ebooks --ocr-lang jpn ~/Books/Manga/
sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
```
## Build from source
@@ -127,7 +132,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
git clone --recursive https://github.com/simon987/sist2/
cd sist2
docker build . -f ./Dockerfile -t my-sist2-image
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
```
### On a linux computer
@@ -144,7 +149,7 @@ docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
```bash
vcpkg install curl[core,openssl]
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libuuid libmagic libraw jasper lcms gumbo
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
```
1. Build

View File

@@ -14,6 +14,7 @@
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
@@ -42,7 +43,7 @@ Scan options
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
# TODO: add new --ocr-* options here
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
@@ -84,7 +85,7 @@ Exec-script options
### Scan options
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size`
@@ -266,9 +267,20 @@ sist2 web index1 index2 index3 index4
When the `rewrite_url` field is not empty, the web module ignores the `root`
field and will return a HTTP redirect to `<rewrite_url><path>/<name><extension>`
instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
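
To make the redirect rule above concrete, here is a minimal sketch of a hand-edited `descriptor.json`. Only the two fields discussed in this section are shown; the remaining descriptor fields are omitted and the values are invented for illustration:

```json
{
  "root": "/mnt/archive/",
  "rewrite_url": "https://files.example.com/archive/"
}
```

With this descriptor, a document stored at `photos/cat.jpg` is returned as an HTTP redirect to `https://files.example.com/archive/photos/cat.jpg` rather than being served from `/mnt/archive/`.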
# Elasticsearch
Elasticsearch versions >=6.8.0, <8.0.0 are supported by sist2.
Using a version >=7.14.0 is recommended to enable the following features:
- Bug fix for large documents (See #198)
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
If you don't care about the features above, you can ignore it or disable it in the configuration page.
## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.
@@ -303,7 +315,7 @@ See [scripting](scripting.md) documentation.
# Sidecar files
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
original document's metadata. Sidecar metadata files will also work inside archives.
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
Sidecar files themselves are not saved in the index.
This feature is useful to leverage third-party applications such as speech-to-text or
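
As a rough example, a sidecar file could look like the sketch below, assuming it sits next to the document it describes (e.g. `recording.mp3.s2meta` beside `recording.mp3`) and contains a flat JSON object of metadata fields; the field names and values here are purely illustrative:

```json
{
  "content": "Transcript produced by an external speech-to-text tool.",
  "author": "Jane Doe"
}
```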

Binary file not shown (size before: 3.9 KiB, after: 35 KiB).

Binary file not shown (size before: 889 KiB, after: 1011 KiB).

View File

@@ -78,6 +78,7 @@
"name": {
"analyzer": "content_analyzer",
"type": "text",
"fielddata": true,
"fields": {
"nGram": {
"type": "text",

View File

@@ -0,0 +1,58 @@
{
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
},
"analysis": {
"tokenizer": {
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"tag_tokenizer": {
"type": "path_hierarchy",
"delimiter": "."
},
"my_nGram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 3
}
},
"analyzer": {
"path_analyzer": {
"tokenizer": "path_tokenizer",
"filter": [
"lowercase"
]
},
"tag_analyzer": {
"tokenizer": "tag_tokenizer",
"filter": [
"lowercase"
]
},
"case_insensitive_kw_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
},
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
},
"content_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}

View File

@@ -6,5 +6,4 @@ python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
printf "static const char *const LibScanCommitHash = \"%s\";\n" $(cd third-party/libscan/ && git rev-parse HEAD) >> src/git_hash.h
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h

View File

@@ -3,6 +3,7 @@ import json
files = [
"schema/mappings.json",
"schema/settings.json",
"schema/settings_legacy.json",
"schema/pipeline.json",
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -51,6 +51,7 @@ export interface EsHit {
duration: number
tag: string[]
checksum: string
thumbnail: string
}
_props: {
isSubDocument: boolean
@@ -61,6 +62,8 @@ export interface EsHit {
isPlayableImage: boolean
isAudio: boolean
hasThumbnail: boolean
tnW: number
tnH: number
}
highlight: {
name: string[] | undefined,
@@ -131,6 +134,8 @@ class Sist2Api {
if ("thumbnail" in hit._source) {
hit._props.hasThumbnail = true;
hit._props.tnW = Number(hit._source.thumbnail.split(",")[0]);
hit._props.tnH = Number(hit._source.thumbnail.split(",")[1]);
}
switch (mimeCategory) {
@@ -251,20 +256,31 @@ class Sist2Api {
});
}
getMimeTypes() {
return this.esQuery({
aggs: {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
getMimeTypes(query = undefined) {
const AGGS = {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
},
size: 0,
}).then(resp => {
}
};
if (!query) {
query = {
aggs: AGGS,
size: 0,
};
} else {
query.size = 0;
query.aggs = AGGS;
}
return this.esQuery(query).then(resp => {
const mimeMap: any[] = [];
resp["aggregations"]["mimeTypes"]["buckets"].sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const buckets = resp["aggregations"]["mimeTypes"]["buckets"];
buckets.sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const tmp = bucket["key"].split("/");
const category = tmp[0];
const mime = tmp[1];
@@ -284,11 +300,18 @@ class Sist2Api {
});
if (!category_exists) {
mimeMap.push({"text": category, children: [child]});
mimeMap.push({text: category, children: [child], id: category});
}
})
return mimeMap;
mimeMap.forEach(node => {
if (node.children) {
node.children.sort((a, b) => a.id.localeCompare(b.id));
}
})
mimeMap.sort((a, b) => a.id.localeCompare(b.id))
return {buckets, mimeMap};
});
}
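
To make the updated `getMimeTypes` signature concrete, here is a short usage sketch based on the calls visible elsewhere in this changeset (the `@/...` import aliases mirror the ones used in the Vue sources; the `console.log` calls are placeholders):

```typescript
import Sist2Api from "@/Sist2Api";
import Sist2Query from "@/Sist2Query";

// Without an argument: aggregate media types over all documents, as before.
Sist2Api.getMimeTypes().then(({buckets}) => {
    console.log(`media types across all documents: ${buckets.length}`);
});

// With a query: reuse the current search query so the media type tree and the
// details aggregation only reflect documents matching the active filters.
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({mimeMap}) => {
    console.log("filtered media type tree:", mimeMap);
});
```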

View File

@@ -43,6 +43,20 @@ const SORT_MODES = {
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.size
},
nameAsc: {
mode: [
{name: {order: "asc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
},
nameDesc: {
mode: [
{name: {order: "desc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
}
} as any;
@@ -73,6 +87,8 @@ class Sist2Query {
const selectedMimeTypes = getters.selectedMimeTypes;
const selectedTags = getters.selectedTags;
const legacyES = store.state.sist2Info.esVersionLegacy;
const filters = [
{terms: {index: selectedIndexIds}}
] as any[];
@@ -187,9 +203,13 @@ class Sist2Query {
"name.nGram": {},
"content.nGram": {},
font_name: {},
},
max_analyzed_offset: 9_999_999
}
};
if (!legacyES) {
q.highlight.max_analyzed_offset = 9_999_999;
}
if (getters.optSearchInPath) {
q.highlight.fields["path.text"] = {};
q.highlight.fields["path.nGram"] = {};

View File

@@ -5,7 +5,6 @@
<b-card-body>
<!-- TODO: ES connectivity, Link to GH page -->
<b-table :items="tableItems" small borderless responsive="md" thead-class="hidden" class="mb-0"></b-table>
<hr />
@@ -16,7 +15,7 @@
<script>
import IndexDebugInfo from "@/components/IndexDebugInfo";
import DebugIcon from "@/components/DebugIcon";
import DebugIcon from "@/components/icons/DebugIcon";
export default {
name: "DebugInfo.vue",
@@ -28,10 +27,12 @@ export default {
{key: "platform", value: this.$store.state.sist2Info.platform},
{key: "debugBinary", value: this.$store.state.sist2Info.debug},
{key: "sist2CommitHash", value: this.$store.state.sist2Info.sist2Hash},
{key: "libscanCommitHash", value: this.$store.state.sist2Info.libscanHash},
{key: "esIndex", value: this.$store.state.sist2Info.esIndex},
{key: "tagline", value: this.$store.state.sist2Info.tagline},
{key: "dev", value: this.$store.state.sist2Info.dev},
{key: "esVersion", value: this.$store.state.sist2Info.esVersion},
{key: "esVersionSupported", value: this.$store.state.sist2Info.esVersionSupported},
{key: "esVersionLegacy", value: this.$store.state.sist2Info.esVersionLegacy},
]
}
}

View File

@@ -15,11 +15,15 @@
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
<div v-if="doc._props.isImage && !hover" class="card-img-overlay" :class="{'small-badge': smallBadge}">
<div
v-if="doc._props.isImage && !hover && doc._props.tnW / doc._props.tnH < 5"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ `${doc._source.width}x${doc._source.height}` }}</span>
</div>
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover" class="card-img-overlay"
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
@@ -30,16 +34,19 @@
</svg>
</div>
<img v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
<img ref="tn"
v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
:src="(doc._props.isGif && hover) ? `f/${doc._id}` : `t/${doc._source.index}/${doc._id}`"
alt=""
:style="{height: (doc._props.isGif && hover) ? `${tnHeight()}px` : undefined}"
class="pointer fit card-img-top" @click="onThumbnailClick()">
<img v-else :src="`t/${doc._source.index}/${doc._id}`" alt=""
class="fit card-img-top">
</div>
<!-- Audio player-->
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls :type="doc._source.mime"
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls
:type="doc._source.mime"
:src="`f/${doc._id}`"
@play="onAudioPlay()"></audio>
@@ -117,6 +124,9 @@ export default {
},
onTnLeave() {
this.hover = false;
},
tnHeight() {
return this.$refs.tn.height;
}
},
}

View File

@@ -1,5 +1,6 @@
<template>
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}">
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}"
@mouseenter="onTnEnter()" @mouseleave="onTnLeave()" >
<!-- Info modal-->
<DocInfoModal :show="showInfo" :doc="doc" @close="showInfo = false"></DocInfoModal>
@@ -56,7 +57,7 @@ import TagContainer from "@/components/TagContainer";
import DocFileTitle from "@/components/DocFileTitle";
import DocInfoModal from "@/components/DocInfoModal";
import ContentDiv from "@/components/ContentDiv";
import FileIcon from "@/components/FileIcon";
import FileIcon from "@/components/icons/FileIcon";
export default {
name: "DocListItem",
@@ -85,7 +86,13 @@ export default {
return this.doc.highlight["path.nGram"] + "/"
}
return this.doc._source.path + "/"
}
},
onTnEnter() {
this.hover = true;
},
onTnLeave() {
this.hover = false;
},
}
}
</script>

View File

@@ -7,11 +7,27 @@
value-field="id"></b-form-select>
</div>
<div v-else>
<b-list-group id="index-picker-desktop">
<div class="d-flex justify-content-between align-content-center">
<span>
{{ selectedIndices.length }}
{{ selectedIndices.length === 1 ? $t("indexPicker.selectedIndex") : $t("indexPicker.selectedIndices") }}
</span>
<div>
<b-button variant="link" @click="selectAll()"> {{ $t("indexPicker.selectAll") }}</b-button>
<b-button variant="link" @click="selectNone()"> {{ $t("indexPicker.selectNone") }}</b-button>
</div>
</div>
<b-list-group id="index-picker-desktop" class="unselectable">
<b-list-group-item
v-for="idx in indices"
@click="toggleIndex(idx)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer">
@click="toggleIndex(idx, $event)"
@click.shift="shiftClick(idx, $event)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer"
:class="{active: lastClickIndex === idx}"
>
<div class="d-flex">
<b-checkbox @change="toggleIndex(idx)" :checked="isSelected(idx)"></b-checkbox>
{{ idx.name }}
@@ -36,6 +52,7 @@ export default Vue.extend({
data() {
return {
loading: true,
lastClickIndex: null
}
},
computed: {
@@ -53,13 +70,50 @@ export default Vue.extend({
...mapActions({
setSelectedIndices: "setSelectedIndices"
}),
shiftClick(index, e) {
if (this.lastClickIndex === null) {
return;
}
const select = this.isSelected(this.lastClickIndex);
let leftBoundary = this.indices.indexOf(this.lastClickIndex);
let rightBoundary = this.indices.indexOf(index);
if (rightBoundary < leftBoundary) {
let tmp = leftBoundary;
leftBoundary = rightBoundary;
rightBoundary = tmp;
}
for (let i = leftBoundary; i <= rightBoundary; i++) {
if (select) {
if (!this.isSelected(this.indices[i])) {
this.setSelectedIndices([this.indices[i], ...this.selectedIndices]);
}
} else {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx !== this.indices[i]));
}
}
},
selectAll() {
this.setSelectedIndices(this.indices);
},
selectNone() {
this.setSelectedIndices([]);
},
onSelect(value) {
this.setSelectedIndices(this.indices.filter(idx => value.includes(idx.id)));
},
formatIdxDate(timestamp: number): string {
return format(new Date(timestamp * 1000), "yyyy-MM-dd");
},
toggleIndex(index) {
toggleIndex(index, e) {
if (e.shiftKey) {
return;
}
this.lastClickIndex = index;
if (this.isSelected(index)) {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx.id != index.id));
} else {
@@ -79,6 +133,11 @@ export default Vue.extend({
font-size: 80%;
}
.theme-black .version-badge {
color: #eee !important;
background: none;
}
.version-badge {
color: #222 !important;
background: none;
@@ -92,4 +151,21 @@ export default Vue.extend({
overflow-y: auto;
max-height: 132px;
}
.btn-link:focus {
box-shadow: none;
}
.unselectable {
user-select: none;
-ms-user-select: none;
-moz-user-select: none;
-webkit-user-select: none;
}
.list-group-item.active {
z-index: 2;
background-color: inherit;
color: inherit;
}
</style>

View File

@@ -1,6 +1,5 @@
<template>
<div>
<!-- TODO: Set slideshowTime as a configurable option-->
<FsLightbox
:key="lightboxKey"
:toggler="showLightbox"
@@ -10,7 +9,7 @@
:types="lightboxTypes"
:source-index="lightboxSlide"
:custom-toolbar-buttons="customButtons"
:slideshow-time="1000 * 10"
:slideshow-time="$store.getters.optLightboxSlideDuration * 1000"
:zoom-increment="0.5"
:load-only-current-source="$store.getters.optLightboxLoadOnlyCurrent"
:on-close="onClose"

View File

@@ -7,37 +7,24 @@ import InspireTree from "inspire-tree";
import InspireTreeDOM from "inspire-tree-dom";
import "inspire-tree-dom/dist/inspire-tree-light.min.css";
import {getSelectedTreeNodes} from "@/util";
import {getSelectedTreeNodes, getTreeNodeAttributes} from "@/util";
import Sist2Api from "@/Sist2Api";
import Sist2Query from "@/Sist2Query";
export default {
name: "MimePicker",
data() {
return {
mimeTree: null,
stashedMimeTreeAttributes: null
}
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
const mimeMap = mutation.payload.slice();
this.mimeTree = new InspireTree({
selection: {
mode: 'checkbox'
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: '#mimeTree'
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
if (mutation.type === "setUiMimeMap" && this.mimeTree === null) {
this.initializeTree();
} else if (mutation.type === "busSearch") {
this.updateTree();
}
});
},
@@ -49,6 +36,73 @@ export default {
this.$store.commit("setSelectedMimeTypes", getSelectedTreeNodes(this.mimeTree));
},
updateTree() {
if (this.$store.getters.optUpdateMimeMap === false) {
return;
}
if (this.stashedMimeTreeAttributes === null) {
this.stashedMimeTreeAttributes = getTreeNodeAttributes(this.mimeTree);
}
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({buckets, mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.$store.commit("setUiDetailsMimeAgg", buckets);
this.mimeTree.removeAll();
this.mimeTree.addNodes(mimeMap);
// Restore selected mimes
if (this.stashedMimeTreeAttributes === null) {
// NOTE: This happens when successive fast searches are triggered
this.stashedMimeTreeAttributes = {};
// Always add the selected mime types
this.$store.state.selectedMimeTypes.forEach(mime => {
this.stashedMimeTreeAttributes[mime] = {
checked: true
}
});
}
Object.entries(this.stashedMimeTreeAttributes).forEach(([mime, attributes]) => {
if (this.mimeTree.node(mime)) {
if (attributes.checked) {
this.mimeTree.node(mime).select();
}
if (attributes.collapsed === false) {
this.mimeTree.node(mime).expand();
}
}
});
this.stashedMimeTreeAttributes = null;
});
},
initializeTree() {
const mimeMap = this.$store.state.uiMimeMap;
this.mimeTree = new InspireTree({
selection: {
mode: "checkbox"
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: "#mimeTree"
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
}
}
}
</script>

View File

@@ -8,7 +8,8 @@
</b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span><span v-if="isLegacy() && !hideLegacy()">-<a
href="https://github.com/simon987/sist2/blob/master/docs/USAGE.md#elasticsearch" target="_blank">legacyES</a></span>
</span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>
@@ -19,7 +20,8 @@
</template>
<script>
import Sist2Icon from "@/components/Sist2Icon";
import Sist2Icon from "@/components/icons/Sist2Icon";
export default {
name: "NavBar",
components: {Sist2Icon},
@@ -32,6 +34,12 @@ export default {
},
isDebug() {
return this.$store.state.sist2Info.debug;
},
isLegacy() {
return this.$store.state.sist2Info.esVersionLegacy;
},
hideLegacy() {
return this.$store.state.optHideLegacy;
}
}
}
@@ -95,7 +103,7 @@ export default {
}
}
.theme-light .btn-link{
.theme-light .btn-link {
color: #222;
}
</style>

View File

@@ -3,31 +3,56 @@
<span>{{ hitCount }} {{ hitCount === 1 ? $t("hit") : $t("hits") }}</span>
<div style="float: right">
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile">{{ $t("details") }}</b-button>
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile" @click="onToggle()">{{
$t("details")
}}
</b-button>
<SortSelect class="ml-2"></SortSelect>
<template v-if="hitCount !== 0">
<SortSelect class="ml-2"></SortSelect>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
</template>
</div>
<b-collapse id="collapse-1" class="pt-2" style="clear:both;">
<b-card>
<b-table :items="tableItems" small borderless thead-class="hidden" class="mb-0"></b-table>
<b-table :items="tableItems" small borderless bordered thead-class="hidden" class="mb-0"></b-table>
<br/>
<h4>
{{$t("mimeTypes")}}
<b-button size="sm" variant="primary" class="float-right" @click="onCopyClick"><ClipboardIcon/></b-button>
</h4>
<Preloader v-if="$store.state.uiDetailsMimeAgg == null"></Preloader>
<b-table
v-else
sort-by="doc_count"
:sort-desc="true"
thead-class="hidden"
:items="$store.state.uiDetailsMimeAgg" small bordered class="mb-0"
></b-table>
</b-card>
</b-collapse>
</b-card>
</template>
<script lang="ts">
import {EsResult} from "@/Sist2Api";
import Sist2Api, {EsResult} from "@/Sist2Api";
import Vue from "vue";
import {humanFileSize, humanTime} from "@/util";
import {humanFileSize} from "@/util";
import DisplayModeToggle from "@/components/DisplayModeToggle.vue";
import SortSelect from "@/components/SortSelect.vue";
import Preloader from "@/components/Preloader.vue";
import Sist2Query from "@/Sist2Query";
import ClipboardIcon from "@/components/icons/ClipboardIcon.vue";
export default Vue.extend({
name: "ResultsCard",
components: {SortSelect, DisplayModeToggle},
components: {ClipboardIcon, Preloader, SortSelect, DisplayModeToggle},
created() {
},
computed: {
lastResultsLoaded() {
return this.$store.state.lastQueryResults != null;
@@ -52,6 +77,39 @@ export default Vue.extend({
totalSize() {
return humanFileSize((this.$store.state.lastQueryResults as EsResult).aggregations.total_size.value);
},
onToggle() {
const show = !document.getElementById("collapse-1").classList.contains("show");
this.$store.commit("setUiShowDetails", show);
if (show && this.$store.state.uiDetailsMimeAgg == null && !this.$store.state.optUpdateMimeMap) {
// Mime aggs are not updated automatically, update now
this.forceUpdateMimeAgg();
}
},
onCopyClick() {
let tsvString = "";
this.$store.state.uiDetailsMimeAgg.slice().sort((a,b) => b["doc_count"] - a["doc_count"]).forEach(row => {
tsvString += `${row["key"]}\t${row["doc_count"]}\n`;
});
navigator.clipboard.writeText(tsvString);
this.$bvToast.toast(
this.$t("toast.copiedToClipboard"),
{
title: null,
noAutoHide: false,
toaster: "b-toaster-bottom-right",
headerClass: "hidden",
bodyClass: "toast-body-info",
});
},
forceUpdateMimeAgg() {
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({buckets}) => {
this.$store.commit("setUiDetailsMimeAgg", buckets);
});
}
},
});

View File

@@ -19,6 +19,14 @@
{{ $t("sort.sizeDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameDesc'}" @click="onSelect('nameDesc')">
{{ $t("sort.nameDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameAsc'}" @click="onSelect('nameAsc')">
{{ $t("sort.nameAsc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'random'}" @click="onSelect('random')">
{{ $t("sort.random") }}
</b-dropdown-item>

View File

@@ -51,7 +51,7 @@
>{{ tag.text.split(".").pop() }}</span>
<b-popover :target="hit._id+tag.rawText" triggers="focus blur" placement="top">
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">Delete</b-button>
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{$t("deleteTag")}}</b-button>
</b-popover>
</div>
@@ -63,7 +63,7 @@
</template>
<!-- Add button -->
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">Add</small>
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">{{$t("addTag")}}</small>
<!-- Size tag-->
<small v-else class="text-muted badge-size">{{

View File

@@ -120,7 +120,7 @@ export default {
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
if (mutation.type === "setUiMimeMap" && this.tagTree === null) {
this.initializeTree();
this.updateTree();
} else if (mutation.type === "busUpdateTags") {
@@ -147,6 +147,7 @@ export default {
this.tagTree.on("node.state.changed", this.handleTreeClick);
},
updateTree() {
// TODO: remember which tags are selected and restore?
const tagMap = [];
Sist2Api.getTags().then(tags => {
tags.forEach(tag => addTag(tagMap, tag.id, tag.id, tag.count));

View File

@@ -0,0 +1,21 @@
<template>
<svg style="width:24px;height:24px" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M17,9H7V7H17M17,13H7V11H17M14,17H7V15H14M12,3A1,1 0 0,1 13,4A1,1 0 0,1 12,5A1,1 0 0,1 11,4A1,1 0 0,1 12,3M19,3H14.82C14.4,1.84 13.3,1 12,1C10.7,1 9.6,1.84 9.18,3H5A2,2 0 0,0 3,5V19A2,2 0 0,0 5,21H19A2,2 0 0,0 21,19V5A2,2 0 0,0 19,3Z"/>
</svg>
</template>
<script>
export default {
name: "ClipboardIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -0,0 +1,21 @@
<template>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm1 16.057v-3.057h2.994c-.059 1.143-.212 2.24-.456 3.279-.823-.12-1.674-.188-2.538-.222zm1.957 2.162c-.499 1.33-1.159 2.497-1.957 3.456v-3.62c.666.028 1.319.081 1.957.164zm-1.957-7.219v-3.015c.868-.034 1.721-.103 2.548-.224.238 1.027.389 2.111.446 3.239h-2.994zm0-5.014v-3.661c.806.969 1.471 2.15 1.971 3.496-.642.084-1.3.137-1.971.165zm2.703-3.267c1.237.496 2.354 1.228 3.29 2.146-.642.234-1.311.442-2.019.607-.344-.992-.775-1.91-1.271-2.753zm-7.241 13.56c-.244-1.039-.398-2.136-.456-3.279h2.994v3.057c-.865.034-1.714.102-2.538.222zm2.538 1.776v3.62c-.798-.959-1.458-2.126-1.957-3.456.638-.083 1.291-.136 1.957-.164zm-2.994-7.055c.057-1.128.207-2.212.446-3.239.827.121 1.68.19 2.548.224v3.015h-2.994zm1.024-5.179c.5-1.346 1.165-2.527 1.97-3.496v3.661c-.671-.028-1.329-.081-1.97-.165zm-2.005-.35c-.708-.165-1.377-.373-2.018-.607.937-.918 2.053-1.65 3.29-2.146-.496.844-.927 1.762-1.272 2.753zm-.549 1.918c-.264 1.151-.434 2.36-.492 3.611h-3.933c.165-1.658.739-3.197 1.617-4.518.88.361 1.816.67 2.808.907zm.009 9.262c-.988.236-1.92.542-2.797.9-.89-1.328-1.471-2.879-1.637-4.551h3.934c.058 1.265.231 2.488.5 3.651zm.553 1.917c.342.976.768 1.881 1.257 2.712-1.223-.49-2.326-1.211-3.256-2.115.636-.229 1.299-.435 1.999-.597zm9.924 0c.7.163 1.362.367 1.999.597-.931.903-2.034 1.625-3.257 2.116.489-.832.915-1.737 1.258-2.713zm.553-1.917c.27-1.163.442-2.386.501-3.651h3.934c-.167 1.672-.748 3.223-1.638 4.551-.877-.358-1.81-.664-2.797-.9zm.501-5.651c-.058-1.251-.229-2.46-.492-3.611.992-.237 1.929-.546 2.809-.907.877 1.321 1.451 2.86 1.616 4.518h-3.933z"/>
</svg>
</template>
<script>
export default {
name: "LanguageIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -5,6 +5,8 @@ export default {
advanced: "Advanced search",
fuzzy: "Fuzzy"
},
addTag: "Add",
deleteTag: "Delete",
download: "Download",
and: "and",
page: "page",
@@ -63,7 +65,9 @@ export default {
slideDuration: "Slide duration",
resultSize: "Number of results per page",
tagOrOperator: "Use OR operator when specifying multiple tags.",
hideDuplicates: "Hide duplicate results based on checksum"
hideDuplicates: "Hide duplicate results based on checksum",
hideLegacy: "Hide the 'legacyES' Elasticsearch notice",
updateMimeMap: "Update the Media Types tree in real time"
},
queryMode: {
simple: "Simple",
@@ -71,7 +75,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grid",
@@ -125,18 +130,21 @@ export default {
esQueryErr: "Could not parse or execute query, please check the Advanced search documentation. " +
"See server logs for more information.",
dupeTagTitle: "Duplicate tag",
dupeTag: "This tag already exists for this document."
dupeTag: "This tag already exists for this document.",
copiedToClipboard: "Copied to clipboard"
},
saveTagModalTitle: "Add tag",
saveTagPlaceholder: "Tag name",
confirm: "Confirm",
indexPickerPlaceholder: "Select indices",
indexPickerPlaceholder: "Select an index",
sort: {
relevance: "Relevance",
dateAsc: "Date (Older first)",
dateDesc: "Date (Newer first)",
sizeAsc: "Size (Smaller first)",
sizeDesc: "Size (Larger first)",
nameAsc: "Name (A-z)",
nameDesc: "Name (Z-a)",
random: "Random",
},
d3: {
@@ -144,7 +152,13 @@ export default {
mimeSize: "Size distribution by media type",
dateHistogram: "File modification time distribution",
sizeHistogram: "File size distribution",
}
},
indexPicker: {
selectNone: "Select None",
selectAll: "Select All",
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
},
fr: {
searchBar: {
@@ -152,6 +166,8 @@ export default {
advanced: "Recherche avancée",
fuzzy: "Approximatif"
},
addTag: "Ajouter",
deleteTag: "Supprimer",
download: "Télécharger",
and: "et",
page: "page",
@@ -211,7 +227,9 @@ export default {
slideDuration: "Durée des diapositives",
resultSize: "Nombre de résultats par page",
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags",
hideDuplicates: "Masquer les résultats en double"
hideDuplicates: "Masquer les résultats en double",
hideLegacy: "Masquer la notice 'legacyES' Elasticsearch",
updateMimeMap: "Mettre à jour l'arbre de Types de médias en temps réel"
},
queryMode: {
simple: "Simple",
@@ -219,7 +237,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grille",
@@ -274,7 +293,8 @@ export default {
esQueryErr: "Impossible d'analyser ou d'exécuter la requête, veuillez consulter la documentation sur la " +
"recherche avancée. Voir les journaux du serveur pour plus d'informations.",
dupeTagTitle: "Tag en double",
dupeTag: "Ce tag existe déjà pour ce document."
dupeTag: "Ce tag existe déjà pour ce document.",
copiedToClipboard: "Copié dans le presse-papier"
},
saveTagModalTitle: "Ajouter un tag",
saveTagPlaceholder: "Nom du tag",
@@ -286,6 +306,8 @@ export default {
dateDesc: "Date (Plus récent)",
sizeAsc: "Taille (Plus petit)",
sizeDesc: "Taille (Plus grand)",
nameAsc: "Nom (A-z)",
nameDesc: "Nom (Z-a)",
random: "Aléatoire",
},
d3: {
@@ -293,6 +315,173 @@ export default {
mimeSize: "Distribution des tailles de fichiers par type de média",
dateHistogram: "Distribution des dates de modification",
sizeHistogram: "Distribution des tailles de fichier",
}
}
}
},
indexPicker: {
selectNone: "Sélectionner aucun",
selectAll: "Sélectionner tout",
selectedIndex: "indice sélectionné",
selectedIndices: "indices sélectionnés",
},
},
"zh-CN": {
searchBar: {
simple: "搜索",
advanced: "高级搜索",
fuzzy: "模糊搜索"
},
addTag: "添加",
deleteTag: "删除",
download: "下载",
and: "与",
page: "页",
pages: "页",
mimeTypes: "文件类型",
tags: "标签",
help: {
simpleSearch: "简易搜索",
advancedSearch: "高级搜索",
help: "帮助",
term: "<关键词>",
and: "与操作",
or: "或操作",
not: "反选单个关键词",
quotes: "括起来的部分视为一个关键词,保序",
prefix: "在词尾使用时,匹配前缀",
parens: "表达式编组",
tildeTerm: "匹配编辑距离以内的关键词",
tildePhrase: "匹配短语,容忍一些非匹配词",
example1:
"例如: <code>\"番茄\" +(炒蛋 | 牛腩) -饭</code> 将匹配" +
"短语 <i>番茄炒蛋</i>、<i>炒蛋</i> 或者 <i>牛腩</i>,而忽略任何带有" +
"<i>饭</i>的关键词.",
defaultOperator:
"表达式中无<code>+</code>或者<code>|</code>时,默认使用" +
"<code>+</code>(与操作)。",
fuzzy:
"选中<b>模糊搜索</b>选项时返回部分匹配的结果3-grams)。",
moreInfoSimple: "详细信息:<a target=\"_blank\" " +
"rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html\">Elasticsearch文档</a>",
moreInfoAdvanced: "高级搜索模式文档:<a target=\"_blank\" rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax\">Elasticsearch文档</a>"
},
config: "配置",
configDescription: "配置在此浏览器中实时保存。",
configReset: "重置所有设置",
searchOptions: "搜索选项",
treemapOptions: "树状图选项",
displayOptions: "显示选项",
opt: {
lang: "语言",
highlight: "启用高亮",
fuzzy: "默认使用模糊搜索",
searchInPath: "匹配文档路径",
suggestPath: "搜索框启用自动补全",
fragmentSize: "高亮上下文大小",
queryMode: "搜索模式",
displayMode: "显示",
columns: "列数",
treemapType: "树状图类属性",
treemapTiling: "树状图平铺",
treemapColorGroupingDepth: "树状图颜色编组深度(展开)",
treemapColor: "树状图颜色(折叠)",
treemapSize: "树状图大小",
theme: "主题",
lightboxLoadOnlyCurrent: "在图片查看器中,不要预读相邻的全图",
slideDuration: "幻灯片时长",
resultSize: "每页结果数",
tagOrOperator: "使用或操作OR匹配多个标签。",
hideDuplicates: "使用校验码隐藏重复结果",
hideLegacy: "隐藏'legacyES' Elasticsearch 通知",
updateMimeMap: "媒体类型树的实时更新"
},
queryMode: {
simple: "简单",
advanced: "高级",
},
lang: {
en: "English",
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "网格",
list: "列表",
},
columns: {
auto: "自动"
},
treemapType: {
cascaded: "折叠",
flat: "平铺(紧凑)"
},
treemapSize: {
small: "小",
medium: "中",
large: "大",
xLarge: "加大",
xxLarge: "加加大",
custom: "自订",
},
treemapTiling: {
binary: "Binary",
squarify: "Squarify",
slice: "Slice",
dice: "Dice",
sliceDice: "Slice & Dice",
},
theme: {
light: "亮",
black: "暗"
},
hit: "命中",
hits: "命中",
details: "详细信息",
stats: "统计信息",
queryTime: "查询时间",
totalSize: "总大小",
pathBar: {
placeholder: "过滤路径",
modalTitle: "选择路径"
},
debug: "调试信息",
debugDescription: "对调试除错有用的信息。 若您遇到bug或者想建议新功能请提交新Issue到" +
"<a href='https://github.com/simon987/sist2/issues/new/choose'>这里</a>.",
tagline: "标签栏",
toast: {
esConnErrTitle: "Elasticsearch连接错误",
esConnErr: "sist2 web 模块连接Elasticsearch出错。" +
"查看服务日志以获取更多信息。",
esQueryErrTitle: "查询错误",
esQueryErr: "无法识别或执行查询,请查阅高级搜索文档。" +
"查看服务日志以获取更多信息。",
dupeTagTitle: "重复标签",
dupeTag: "该标签已存在于此文档。",
copiedToClipboard: "复制到剪贴板"
},
saveTagModalTitle: "增加标签",
saveTagPlaceholder: "标签名",
confirm: "确认",
indexPickerPlaceholder: "选择一个索引",
sort: {
relevance: "相关度",
dateAsc: "日期(由旧到新)",
dateDesc: "日期(由新到旧)",
sizeAsc: "大小(从小到大)",
sizeDesc: "大小(从大到小)",
nameAsc: "名字A-z",
nameDesc: "名字 Z-a",
random: "随机",
},
d3: {
mimeCount: "各类文件数量分布",
mimeSize: "各类文件大小分布",
dateHistogram: "文件修改时间分布",
sizeHistogram: "文件大小分布",
},
indexPicker: {
selectNone: "清空",
selectAll: "全选",
selectedIndex: "选中索引",
selectedIndices: "选中索引",
},
},
}

View File

@@ -27,6 +27,7 @@ export default new Vuex.Store({
size: 60,
optLang: "en",
optLangIsDefault: true,
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
@@ -46,6 +47,8 @@ export default new Vuex.Store({
optTreemapColor: "PuBuGn",
optLightboxLoadOnlyCurrent: false,
optLightboxSlideDuration: 15,
optHideLegacy: false,
optUpdateMimeMap: true,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@@ -70,9 +73,14 @@ export default new Vuex.Store({
uiLightboxSlide: 0,
uiReachedScrollEnd: false,
uiDetailsMimeAgg: null,
uiShowDetails: false,
uiMimeMap: [] as any[]
},
mutations: {
setUiShowDetails: (state, val) => state.uiShowDetails = val,
setUiDetailsMimeAgg: (state, val) => state.uiDetailsMimeAgg = val,
setUiReachedScrollEnd: (state, val) => state.uiReachedScrollEnd = val,
setTags: (state, val) => state.tags = val,
setPathText: (state, val) => state.pathText = val,
@@ -81,7 +89,10 @@ export default new Vuex.Store({
setSist2Info: (state, val) => state.sist2Info = val,
setSeed: (state, val) => state.seed = val,
setOptHideDuplicates: (state, val) => state.optHideDuplicates = val,
setOptLang: (state, val) => state.optLang = val,
setOptLang: (state, val) => {
state.optLang = val;
state.optLangIsDefault = false;
},
setSortMode: (state, val) => state.sortMode = val,
setIndices: (state, val) => {
state.indices = val;
@@ -144,8 +155,11 @@ export default new Vuex.Store({
setOptTreemapColorGroupingDepth: (state, val) => state.optTreemapColorGroupingDepth = val,
setOptTreemapSize: (state, val) => state.optTreemapSize = val,
setOptTreemapColor: (state, val) => state.optTreemapColor = val,
setOptHideLegacy: (state, val) => state.optHideLegacy = val,
setOptUpdateMimeMap: (state, val) => state.optUpdateMimeMap = val,
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
setUiMimeMap: (state, val) => state.uiMimeMap = val,
@@ -155,8 +169,18 @@ export default new Vuex.Store({
busUpdateTags: () => {
// noop
},
busSearch: () => {
// noop
},
},
actions: {
setSist2Info: (store, val) => {
store.commit("setSist2Info", val);
if (store.state.optLangIsDefault) {
store.commit("setOptLang", val.lang);
}
},
loadFromArgs({commit}, route: Route) {
if (route.query.q) {
@@ -276,6 +300,7 @@ export default new Vuex.Store({
commit("setUiLightboxTypes", []);
commit("setUiLightboxCaptions", []);
commit("setUiLightboxKey", 0);
commit("setUiDetailsMimeAgg", null);
}
},
modules: {},
@@ -339,5 +364,7 @@ export default new Vuex.Store({
optLightboxLoadOnlyCurrent: state => state.optLightboxLoadOnlyCurrent,
optLightboxSlideDuration: state => state.optLightboxSlideDuration,
optResultSize: state => state.size,
optHideLegacy: state => state.optHideLegacy,
optUpdateMimeMap: state => state.optUpdateMimeMap,
}
})

View File

@@ -97,6 +97,30 @@ export function getSelectedTreeNodes(tree: any) {
return Array.from(selectedNodes);
}
export function getTreeNodeAttributes(tree: any) {
const nodes = tree.selectable();
const attributes = {};
for (let i = 0; i < nodes.length; i++) {
let id = null;
if (nodes[i].text.indexOf("(") !== -1 && nodes[i].values) {
id = nodes[i].values.slice(-1)[0];
} else {
id = nodes[i].id
}
attributes[id] = {
checked: nodes[i].itree.state.checked,
collapsed: nodes[i].itree.state.collapsed,
}
}
return attributes;
}
export function serializeMimes(mimes: string[]): string | undefined {
if (mimes.length == 0) {
return undefined;

View File

@@ -15,11 +15,8 @@
<h4>{{ $t("displayOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<label>{{ $t("opt.lang") }}</label>
<label><LanguageIcon/><span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<label>{{ $t("opt.theme") }}</label>
@@ -30,6 +27,20 @@
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<div style="height: 10px"></div>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<b-form-checkbox :checked="optUpdateMimeMap" @input="setOptUpdateMimeMap">
{{ $t("opt.updateMimeMap") }}
</b-form-checkbox>
</b-card>
<br/>
@@ -113,15 +124,15 @@
</template>
<script>
import Vue from "vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import DebugInfo from "@/components/DebugInfo.vue";
import Preloader from "@/components/Preloader.vue";
import sist2 from "@/Sist2Api";
import GearIcon from "@/components/GearIcon.vue";
import GearIcon from "@/components/icons/GearIcon.vue";
import LanguageIcon from "@/components/icons/LanguageIcon";
export default {
components: {GearIcon, DebugInfo, Preloader},
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: true,
@@ -129,6 +140,7 @@ export default {
langOptions: [
{value: "en", text: this.$t("lang.en")},
{value: "fr", text: this.$t("lang.fr")},
{value: "zh-CN", text: this.$t("lang.zh-CN")},
],
queryModeOptions: [
{value: "simple", text: this.$t("queryMode.simple")},
@@ -215,6 +227,8 @@ export default {
"optTagOrOperator",
"optLang",
"optHideDuplicates",
"optHideLegacy",
"optUpdateMimeMap",
]),
clientWidth() {
return window.innerWidth;
@@ -222,7 +236,7 @@ export default {
},
mounted() {
sist2.getSist2Info().then(data => {
this.$store.commit("setSist2Info", data)
this.setSist2Info(data);
this.loading = false;
});
@@ -233,6 +247,9 @@ export default {
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
@@ -250,11 +267,12 @@ export default {
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptContainerWidth",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",
"setOptHideDuplicates"
"setOptHideDuplicates",
"setOptHideLegacy",
"setOptUpdateMimeMap"
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");

View File

@@ -31,7 +31,7 @@
</b-row>
</b-col>
<b-col>
<b-tabs>
<b-tabs justified>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
@@ -43,9 +43,13 @@
</b-row>
</b-card>
<Preloader v-if="searchBusy && docs.length === 0" class="mt-3"></Preloader>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<div v-else-if="docs.length > 0">
<ResultsCard></ResultsCard>
</div>
<div v-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
@@ -56,7 +60,7 @@
<script lang="ts">
import Preloader from "@/components/Preloader.vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import sist2 from "../Sist2Api";
import Sist2Api, {EsHit, EsResult} from "../Sist2Api";
import SearchBar from "@/components/SearchBar.vue";
@@ -109,10 +113,6 @@ export default Vue.extend({
}, 350, {leading: false});
Sist2Api.getMimeTypes().then(mimeMap => {
this.$store.commit("setUiMimeMap", mimeMap);
});
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
@@ -138,17 +138,23 @@ export default Vue.extend({
sist2.getSist2Info().then(data => {
this.setSist2Info(data);
this.setIndices(data.indices);
this.uiLoading = false;
this.search(true);
Sist2Api.getMimeTypes(Sist2Query.searchQuery()).then(({mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
}).catch(() => {
this.showErrorToast();
});
});
},
methods: {
...mapMutations({
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",
@@ -179,6 +185,7 @@ export default Vue.extend({
async searchNow(q: any) {
this.searchBusy = true;
await this.$store.dispatch("incrementQuerySequence");
this.$store.commit("busSearch");
Sist2Api.esQuery(q).then(async (resp: EsResult) => {
await this.handleSearch(resp);
@@ -209,7 +216,7 @@ export default Vue.extend({
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
@@ -280,6 +287,11 @@ export default Vue.extend({
border: none;
}
.toast-header-info, .toast-body-info {
background: #2196f3;
color: #fff !important;
}
.toast-header-error, .toast-body-error {
background: #a94442;
color: #f2dede !important;


@@ -22,6 +22,7 @@
const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"/usr/share/tesseract-ocr/4.00/tessdata/",
"./",
NULL
};
@@ -145,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->name == NULL) {
args->name = g_path_get_basename(args->output);
} else {
char* tmp = malloc(strlen(args->name) + 1);
char *tmp = malloc(strlen(args->name) + 1);
strcpy(tmp, args->name);
args->name = tmp;
}
@@ -167,17 +168,50 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
return 1;
}
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
if (args->ocr_images && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
return 1;
}
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
return 1;
}
if (args->tesseract_lang != NULL) {
if (!args->ocr_ebooks && !args->ocr_images) {
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
return 1;
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
TessBaseAPI *api = TessBaseAPICreate();
const char *trained_data_path = NULL;
char *lang = malloc(strlen(args->tesseract_lang) + 1);
strcpy(lang, args->tesseract_lang);
lang = strtok(lang, "+");
while (lang != NULL) {
char filename[128];
sprintf(filename, "%s.traineddata", lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
}
if (trained_data_path != NULL && path != trained_data_path) {
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
"files must be in the same folder")
}
trained_data_path = path;
lang = strtok(NULL, "+");
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
@@ -185,7 +219,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
args->tesseract_path = trained_data_path;
}
if (args->exclude_regex != NULL) {
@@ -218,6 +252,19 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
if (args->list_path != NULL) {
if (strcmp(args->list_path, "-") == 0) {
args->list_file = stdin;
LOG_DEBUG("cli.c", "Using stdin as list file")
} else {
args->list_file = fopen(args->list_path, "r");
if (args->list_file == NULL) {
LOG_FATALF("main.c", "List file could not be opened: %s (%s)", args->list_path, errno);
}
}
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@@ -237,6 +284,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
return 0;
}
@@ -362,15 +410,15 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
args->es_index = DEFAULT_ES_INDEX;
}
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (args->tagline == NULL) {
args->tagline = DEFAULT_TAGLINE;
}
if (strlen(args->lang) != 2) {
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (strlen(args->lang) != 2 && strlen(args->lang) != 5) {
fprintf(stderr, "Invalid --lang value, see usage\n");
return 1;
}
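
A condensed, self-contained sketch of the multi-language validation added above, for readers skimming the hunk. It is illustrative only: find_file_in_paths() is stubbed out here (the real lookup over TESS_DATAPATHS lives in cli.c); the point is that a combined --ocr-lang value such as "eng+fra" is split on '+' and every traineddata file must resolve to the same folder, because TessBaseAPIInit3() accepts a single datapath.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub standing in for the real find_file_in_paths() in cli.c (illustration only). */
static const char *find_file_in_paths(const char **paths, const char *filename) {
    (void) filename;
    return paths[0]; // pretend every .traineddata file is found in the first path
}

int main(void) {
    const char *TESS_DATAPATHS[] = {"/usr/share/tessdata/", NULL};
    const char *tesseract_lang = "eng+fra"; // e.g. --ocr-lang eng+fra

    char *lang_copy = strdup(tesseract_lang);
    const char *trained_data_path = NULL;

    for (char *lang = strtok(lang_copy, "+"); lang != NULL; lang = strtok(NULL, "+")) {
        char filename[128];
        snprintf(filename, sizeof(filename), "%s.traineddata", lang);

        const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
        if (path == NULL) {
            fprintf(stderr, "Could not find tesseract language file: %s\n", filename);
            return 1;
        }
        // All languages must share one tessdata folder: TessBaseAPIInit3() takes a single datapath.
        if (trained_data_path != NULL && path != trained_data_path) {
            fprintf(stderr, "traineddata files must be in the same folder\n");
            return 1;
        }
        trained_data_path = path;
    }
    free(lang_copy);

    printf("datapath=%s langs=%s\n", trained_data_path, tesseract_lang);
    return 0;
}
```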


@@ -21,6 +21,8 @@ typedef struct scan_args {
char *archive_passphrase;
char *tesseract_lang;
const char *tesseract_path;
int ocr_images;
int ocr_ebooks;
char *exclude_regex;
int fast;
const char* treemap_threshold_str;
@@ -29,6 +31,8 @@ typedef struct scan_args {
int read_subtitles;
int fast_epub;
int calculate_checksums;
char *list_path;
FILE *list_file;
} scan_args_t;
scan_args_t *scan_args_create();


@@ -2,6 +2,7 @@
ScanCtx_t ScanCtx = {
.stat_index_size = 0,
.stat_tn_size = 0,
.dbg_current_files = NULL,
.pool = NULL
};


@@ -17,6 +17,7 @@
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include "src/index/elastic.h"
#include <glib.h>
#include <pcre.h>
@@ -40,6 +41,7 @@ typedef struct {
GHashTable *original_table;
GHashTable *copy_table;
pthread_mutex_t copy_table_mu;
pcre *exclude;
pcre_extra *exclude_extra;
@@ -75,6 +77,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int batch_size;
tpool_t *pool;
@@ -86,6 +89,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int index_count;
char *auth_user;
@@ -94,7 +98,7 @@ typedef struct {
int tag_auth_enabled;
char *tagline;
struct index_t indices[256];
char lang[3];
char lang[10];
int dev;
} WebCtx_t;


@@ -253,7 +253,7 @@ void _elastic_flush(int max) {
} else {
print_errors(r);
LOG_INFOF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
LOG_DEBUGF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
delete_queue(max);
if (Indexer->queued != 0) {
@@ -356,7 +356,65 @@ void finish_indexer(char *script, int async_script, char *index_id) {
free_response(r);
}
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings) {
es_version_t *elastic_get_version(const char *es_url) {
response_t *r = web_get(es_url, 30);
char *tmp = malloc(r->size + 1);
memcpy(tmp, r->body, r->size);
*(tmp + r->size) = '\0';
cJSON *response = cJSON_Parse(tmp);
free(tmp);
free_response(r);
if (response == NULL) {
return NULL;
}
if (cJSON_GetObjectItem(response, "version") == NULL ||
cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number") == NULL) {
cJSON_Delete(response);
return NULL;
}
char *version_str = cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number")->valuestring;
es_version_t *version = malloc(sizeof(es_version_t));
const char *tok = strtok(version_str, ".");
version->major = atoi(tok);
tok = strtok(NULL, ".");
version->minor = atoi(tok);
tok = strtok(NULL, ".");
version->patch = atoi(tok);
cJSON_Delete(response);
return version;
}
void elastic_init(int force_reset, const char *user_mappings, const char *user_settings) {
es_version_t *es_version = elastic_get_version(IndexCtx.es_url);
IndexCtx.es_version = es_version;
if (es_version == NULL) {
LOG_FATAL("elastic.c", "Could not get ES version")
}
LOG_INFOF("elastic.c",
"Elasticsearch version is %s (supported=%d, legacy=%d)",
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), USE_LEGACY_ES_SETTINGS(es_version));
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
LOG_FATAL("elastic.c", "sist2 only supports Elasticsearch v6.8 or newer")
}
char *settings = NULL;
if (USE_LEGACY_ES_SETTINGS(es_version)) {
settings = settings_json;
} else {
settings = settings_legacy_json;
}
// Check if index exists
char url[4096];
@@ -392,7 +450,7 @@ void elastic_init(int force_reset, const char* user_mappings, const char* user_s
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_settings", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, user_settings ? user_settings : settings_json);
r = web_put(url, user_settings ? user_settings : settings);
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);


@@ -9,6 +9,26 @@ typedef struct es_bulk_line {
char line[0];
} es_bulk_line_t;
typedef struct {
int major;
int minor;
int patch;
} es_version_t;
#define VERSION_GE(version, maj, min) ((version)->major > (maj) || ((version)->major == (maj) && (version)->minor >= (min)))
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))
__always_inline
static const char *format_es_version(es_version_t *version) {
static char buf[64];
snprintf(buf, sizeof(buf), "%d.%d.%d", version->major, version->minor, version->patch);
return buf;
}
/**
* Note: indexer is *not* thread safe
*/
@@ -31,6 +51,8 @@ cJSON *elastic_get_document(const char *id_str);
char *elastic_get_status();
es_version_t *elastic_get_version(const char *es_url);
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
#endif
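
To make the new version gating concrete, here is a small worked example. The struct and macros are copied verbatim from elastic.h above; the driver loop and sample version numbers are illustrative only.

```c
#include <stdio.h>

/* Copied from elastic.h above so the example is self-contained. */
typedef struct {
    int major;
    int minor;
    int patch;
} es_version_t;

#define VERSION_GE(version, maj, min) ((version)->major > (maj) || ((version)->major == (maj) && (version)->minor >= (min)))
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))

int main(void) {
    es_version_t versions[] = {
        {6, 7, 2},   // too old: rejected at startup
        {6, 8, 23},  // supported, pre-7.14: legacy settings
        {7, 14, 0},  // supported, current settings
        {8, 1, 0},   // supported, current settings
    };

    for (int i = 0; i < 4; i++) {
        es_version_t *v = &versions[i];
        printf("%d.%d.%d supported=%d legacy=%d\n",
               v->major, v->minor, v->patch,
               IS_SUPPORTED_ES_VERSION(v), USE_LEGACY_ES_SETTINGS(v));
    }
    return 0;
}
```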

File diff suppressed because one or more lines are too long


@@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "parent";
case MetaExifMake:
return "exif_make";
case MetaExifDescription:
return "exif_description";
case MetaExifSoftware:
return "exif_software";
case MetaExifExposureTime:
@@ -150,6 +152,7 @@ char *build_json_string(document_t *doc) {
case MetaFontName:
case MetaParent:
case MetaExifMake:
case MetaExifDescription:
case MetaExifSoftware:
case MetaExifExposureTime:
case MetaExifFNumber:


@@ -23,7 +23,6 @@ store_t *store_create(const char *path, size_t chunk_size) {
}
store->size = (size_t) store->chunk_size;
ScanCtx.stat_tn_size = 0;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi


@@ -4,6 +4,8 @@
#include <ftw.h>
#define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
@@ -43,26 +45,91 @@ int sub_strings[30];
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ftw->level > ScanCtx.depth) {
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
return 0;
} else if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
return FTW_CONTINUE;
}
#define MAX_FILE_DESCRIPTORS 64
int walk_directory_tree(const char *dirpath) {
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_DEPTH);
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
}
int iterate_file_list(void *input_file) {
char buf[PATH_MAX];
struct stat info;
while (fgets(buf, sizeof(buf), input_file) != NULL) {
// Remove trailing newline
*(buf + strlen(buf) - 1) = '\0';
int stat_ret = stat(buf, &info);
if (stat_ret != 0) {
LOG_ERRORF("walk.c", "Could not stat file %s (%s)", buf, strerror(errno));
continue;
}
if (!S_ISREG(info.st_mode)) {
LOG_ERRORF("walk.c", "Is not a regular file: %s", buf);
continue;
}
char *absolute_path = canonicalize_file_name(buf);
if (absolute_path == NULL) {
LOG_FATALF("walk.c", "FIXME: Could not get absolute path of %s", buf);
}
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
if (S_ISREG(info.st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
}
continue;
}
if (!STR_STARTS_WITH(absolute_path, ScanCtx.index.desc.root)) {
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
}
int base = (int) (strrchr(buf, '/') - buf) + 1;
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
free(absolute_path);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
}
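
The switch from FTW_DEPTH to FTW_ACTIONRETVAL above is what lets the walker prune excluded directories instead of merely skipping their files. A minimal sketch of that pattern, assuming a hypothetical is_excluded() stand-in for the pcre-based EXCLUDED() macro:

```c
#define _GNU_SOURCE /* FTW_ACTIONRETVAL, FTW_SKIP_SUBTREE and FTW_CONTINUE are GNU extensions */
#include <ftw.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

/* Hypothetical exclude check standing in for the pcre-based EXCLUDED() macro. */
static int is_excluded(const char *path) {
    return strstr(path, "/node_modules") != NULL;
}

static int handle_entry(const char *path, const struct stat *info, int typeflag, struct FTW *ftw) {
    (void) info;
    (void) ftw;

    if (is_excluded(path)) {
        // Returning FTW_SKIP_SUBTREE for a directory prunes the whole subtree,
        // so excluded folders are never descended into.
        return typeflag == FTW_D ? FTW_SKIP_SUBTREE : FTW_CONTINUE;
    }
    if (typeflag == FTW_F) {
        printf("would scan: %s\n", path);
    }
    return FTW_CONTINUE;
}

int main(int argc, char *argv[]) {
    const char *root = argc > 1 ? argv[1] : ".";
    // FTW_ACTIONRETVAL makes nftw() honor the FTW_CONTINUE / FTW_SKIP_SUBTREE return values.
    return nftw(root, handle_entry, 64, FTW_PHYS | FTW_ACTIONRETVAL) == -1 ? 1 : 0;
}
```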


@@ -5,4 +5,6 @@
int walk_directory_tree(const char *);
int iterate_file_list(void* input_file);
#endif


@@ -55,10 +55,14 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
log_len += 1;
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno))
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}
void sist_logf(const char *filepath, int level, char *format, ...) {
@@ -104,8 +108,12 @@ void sist_log(const char *filepath, int level, char *str) {
);
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}
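
The memmove/'\n' dance above pairs with the new PrintingProgressBar flag in util.c: the progress bar redraws itself with '\r' and no newline, so the next log line has to move to a fresh row first. A stripped-down sketch of that handshake, with illustrative messages and buffer sizes:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int PrintingProgressBar = 0;

static void progress_bar_print(int percent) {
    // '\r' redraws the bar in place; no trailing newline is written.
    fprintf(stderr, "\r%3d%%[====>      ]", percent);
    PrintingProgressBar = 1;
}

static void sist_log(const char *msg) {
    char log_str[256];
    size_t log_len = (size_t) snprintf(log_str, sizeof(log_str), "%s\n", msg);

    if (PrintingProgressBar) {
        // A bar is currently on screen: shift the buffer right and prepend '\n'
        // so the log line starts on its own row (same trick as log.c above).
        PrintingProgressBar = 0;
        memmove(log_str + 1, log_str, log_len);
        log_str[0] = '\n';
        log_len += 1;
    }
    ssize_t ignored = write(STDERR_FILENO, log_str, log_len);
    (void) ignored;
}

int main(void) {
    progress_bar_print(42);
    sist_log("indexed 1000 documents"); // printed below the bar, not glued to it
    progress_bar_print(43);
    fprintf(stderr, "\n");
    return 0;
}
```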


@@ -14,6 +14,9 @@
#include "parsing/mime.h"
#include "parsing/parse.h"
#include <signal.h>
#include <unistd.h>
#include "stats.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
@@ -29,8 +32,6 @@ static const char *const usage[] = {
NULL,
};
#include<signal.h>
#include<unistd.h>
static __sighandler_t sigsegv_handler = NULL;
static __sighandler_t sigabrt_handler = NULL;
@@ -169,6 +170,7 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
ScanCtx.calculate_checksums = args->calculate_checksums;
@@ -218,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
if (args->ocr_images) {
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
}
init_media();
// OOXML
@@ -334,10 +341,20 @@ void sist2_scan(scan_args_t *args) {
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
tpool_start(ScanCtx.writer_pool);
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
if (args->list_path) {
// Scan using file list
int list_ret = iterate_file_list(args->list_file);
if (list_ret != 0) {
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
}
} else {
// Scan directory recursively
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
}
}
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
@@ -433,7 +450,7 @@ void sist2_index(index_args_t *args) {
cleanup = elastic_cleanup;
}
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, FALSE);
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, args->print == 0);
tpool_start(IndexCtx.pool);
struct dirent *de;
@@ -489,7 +506,7 @@ void sist2_web(web_args_t *args) {
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
WebCtx.tagline = args->tagline;
WebCtx.dev = args->dev;
strcpy(WebCtx.lang, "en");
strcpy(WebCtx.lang, args->lang);
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
@@ -518,8 +535,8 @@ void sist2_web(web_args_t *args) {
int main(int argc, const char *argv[]) {
// sigsegv_handler = signal(SIGSEGV, sig_handler);
// sigabrt_handler = signal(SIGABRT, sig_handler);
sigsegv_handler = signal(SIGSEGV, sig_handler);
sigabrt_handler = signal(SIGABRT, sig_handler);
setlocale(LC_ALL, "");
@@ -564,8 +581,11 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
"Passphrase for encrypted archive files"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
"Tesseract language (use 'tesseract --list-langs' to see "
"which are installed on your machine)"),
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
@@ -577,6 +597,9 @@ int main(int argc, const char *argv[]) {
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
" instead of normal directory traversal. Use '-' to read"
" from stdin."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
@@ -599,6 +622,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
OPT_STRING(0, "lang", &web_args->lang, "Default UI language. Can be changed by the user"),
OPT_GROUP("Exec-script options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),


@@ -79,7 +79,9 @@ void parse(void *arg) {
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
pthread_mutex_lock(&ScanCtx.copy_table_mu);
incremental_mark_file_for_copy(ScanCtx.copy_table, doc->path_md5);
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_skipped_files_count += 1;


@@ -1,6 +1,8 @@
#ifndef SIST_H
#define SIST_H
#define _GNU_SOURCE
#ifndef FALSE
#define FALSE (0)
#define BOOL int
@@ -51,7 +53,7 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "2.11.3"
#define VERSION "2.11.6"
static const char *const Version = VERSION;
#ifndef SIST_PLATFORM


@@ -177,7 +177,7 @@ static void *tpool_worker(void *arg) {
}
void tpool_wait(tpool_t *pool) {
LOG_INFO("tpool.c", "Waiting for worker threads to finish")
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish")
pthread_mutex_lock(&(pool->work_mutex));
while (TRUE) {
if (pool->done_cnt < pool->work_cnt) {
@@ -191,7 +191,9 @@ void tpool_wait(tpool_t *pool) {
}
}
}
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
if (pool->print_progress) {
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
}
pthread_mutex_unlock(&(pool->work_mutex));
LOG_INFO("tpool.c", "Worker threads finished")


@@ -84,11 +84,13 @@ char *expandpath(const char *path) {
return expanded;
}
int PrintingProgressBar = 0;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
static int last_val = -1;
int val = (int) (percentage * 100);
if (last_val == val || val > 100 || index_size < 1024) {
if (last_val == val || val > 100) {
return;
}
last_val = val;
@@ -114,13 +116,21 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
index_unit = 'M';
}
printf(
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
fflush(stdout);
if (tn_size == 0 && index_size == 0) {
fprintf(stderr,
"\r%3d%%[%.*s>%*s]",
val, lpad, PBSTR, rpad, ""
);
} else {
fprintf(stderr,
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
}
PrintingProgressBar = TRUE;
}
GHashTable *incremental_get_table() {


@@ -19,6 +19,8 @@ char *expandpath(const char *path);
dyn_buffer_t url_escape(char *str);
extern int PrintingProgressBar;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();
@@ -131,6 +133,9 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
}
}
/**
* Not thread safe!
*/
__always_inline
static int incremental_mark_file_for_copy(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
char *ptr = malloc(MD5_STR_LENGTH);


@@ -252,15 +252,34 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
mg_http_serve_file(nc, hm, full_path, mime, disposition);
}
void cache_es_version() {
static int is_cached = FALSE;
if (is_cached == TRUE) {
return;
}
es_version_t *es_version = elastic_get_version(WebCtx.es_url);
if (es_version != NULL) {
WebCtx.es_version = es_version;
is_cached = TRUE;
}
}
void index_info(struct mg_connection *nc) {
cache_es_version();
cJSON *json = cJSON_CreateObject();
cJSON *arr = cJSON_AddArrayToObject(json, "indices");
cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
cJSON_AddStringToObject(json, "version", Version);
cJSON_AddStringToObject(json, "esVersion", format_es_version(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionLegacy", USE_LEGACY_ES_SETTINGS(WebCtx.es_version));
cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
cJSON_AddStringToObject(json, "libscanHash", LibScanCommitHash);
cJSON_AddStringToObject(json, "lang", WebCtx.lang);
cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
#ifdef SIST_DEBUG

File diff suppressed because one or more lines are too long

third-party/libscan vendored

Submodule third-party/libscan deleted from 3787475ecb

third-party/libscan/.gitignore vendored Normal file

@@ -0,0 +1,12 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt
scan_test
third-party/ext_*
libscan-test-files
scan_*_test

third-party/libscan/CMakeLists.txt vendored Normal file

@@ -0,0 +1,233 @@
cmake_minimum_required(VERSION 3.15)
project(scan)
set(CMAKE_C_STANDARD 11)
option(BUILD_TESTS "Build tests" on)
add_subdirectory(third-party/antiword)
add_compile_definitions(
antiword
NDEBUG
)
add_library(
scan
libscan/util.c libscan/util.h
libscan/scan.h
libscan/macros.h
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED)
find_package(Threads REQUIRED)
find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
find_package(unofficial-brotli CONFIG REQUIRED)
find_library(LZO2_LIB NAMES lzo2)
find_library(RAW_LIB NAMES libraw.a)
find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
target_compile_options(
scan
PRIVATE
-g
)
include(ExternalProject)
find_program(MAKE_EXE NAMES gmake nmake make)
ExternalProject_Add(
libmobi
GIT_REPOSITORY https://github.com/simon987/libmobi.git
GIT_TAG "public"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./autogen.sh && ./configure
INSTALL_COMMAND ""
PREFIX "third-party/ext_libmobi"
SOURCE_DIR "third-party/ext_libmobi/src/libmobi"
BINARY_DIR "third-party/ext_libmobi/src/libmobi"
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent
)
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
ExternalProject_Add(
ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "n4.4"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BUILD_COMMAND ${MAKE_EXE} -j33 --silent
)
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
add_dependencies(
scan
libmobi
ffmpeg
antiword
libwpd
)
target_link_libraries(
scan
PUBLIC
cjson
${LibArchive_LIBRARIES}
ZLIB::ZLIB
BZip2::BZip2
lz4::lz4
${LZO2_LIB}
LibLZMA::LibLZMA
${MUPDF_LIB}
openjp2
${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB}
${HARFBUZZ_LIB}
${JBIG2DEC_LIB}
stdc++
-Wl,--whole-archive
m
-Wl,--no-whole-archive
${JPEG_LIBRARIES}
${Tesseract_LIBRARIES}
${LIBXML2_LIBRARIES}
${FREETYPE_LIB}
unofficial::brotli::brotlidec-static
${FFMPEG_LIB_DIR}/libavformat/libavformat.a
${FFMPEG_LIB_DIR}/libavcodec/libavcodec.a
${FFMPEG_LIB_DIR}/libavutil/libavutil.a
${FFMPEG_LIB_DIR}/libswresample/libswresample.a
${FFMPEG_LIB_DIR}/libswscale/libswscale.a
z
${CMAKE_THREAD_LIBS_INIT}
${RAW_LIB}
${GOMP_LIB}
${CMS_LIB}
${JAS_LIB}
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
target_include_directories(
scan
PUBLIC
${MUPDF_INC_DIR}
${JPEG_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
)
if (BUILD_TESTS)
find_package(GTest CONFIG REQUIRED)
add_executable(scan_ub_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_ub_test PRIVATE -g -fsanitize=undefined -fno-omit-frame-pointer)
target_link_libraries(scan_ub_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=undefined scan)
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
endif()

third-party/libscan/README.md vendored Normal file

@@ -0,0 +1,4 @@
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
```

third-party/libscan/libscan/arc/arc.c vendored Normal file

@@ -0,0 +1,244 @@
#include "arc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
int should_parse_filtered_file(const char *filepath, int ext) {
char tmp[PATH_MAX * 2];
if (ext == 0) {
return FALSE;
}
if (strncmp(filepath + ext, "tgz", 3) == 0) {
return TRUE;
}
memcpy(tmp, filepath, ext - 1);
*(tmp + ext - 1) = '\0';
char *idx = strrchr(tmp, '.');
if (idx == NULL) {
return FALSE;
}
if (strcmp(idx, ".tar") == 0) {
return TRUE;
}
return FALSE;
}
void arc_close(struct vfile *f) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
if (f->rewind_buffer != NULL) {
free(f->rewind_buffer);
f->rewind_buffer = NULL;
f->rewind_buffer_size = 0;
f->rewind_buffer_cursor = 0;
}
}
int arc_read(struct vfile *f, void *buf, size_t size) {
int bytes_copied = 0;
if (f->rewind_buffer_size != 0) {
if (size > f->rewind_buffer_size) {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);
bytes_copied = f->rewind_buffer_size;
size -= f->rewind_buffer_size;
buf += f->rewind_buffer_size;
f->rewind_buffer_size = 0;
} else {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
f->rewind_buffer_size -= (int) size;
f->rewind_buffer_cursor += (int) size;
return (int) size;
}
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
}
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
return (int) bytes_read + bytes_copied;
}
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
if (f->rewind_buffer != NULL) {
fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
exit(-1);
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
f->rewind_buffer = malloc(size);
f->rewind_buffer_size = (int) size;
f->rewind_buffer_cursor = 0;
memcpy(f->rewind_buffer, buf, size);
return (int) bytes_read;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
arc_data->f = f;
if (f->is_fs_file) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
if (ctx->passphrase[0] != 0) {
archive_read_add_passphrase(*a, ctx->passphrase);
}
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
} else if (allow_recurse) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
if (ctx->passphrase[0] != 0) {
archive_read_add_passphrase(*a, ctx->passphrase);
}
return archive_read_open(
*a, arc_data,
vfile_open_callback,
vfile_read_callback,
vfile_close_callback
);
} else {
return ARC_SKIPPED;
}
}
static __thread int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
arc_data.f = f;
int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
if (ret == ARC_SKIPPED) {
return SCAN_OK;
}
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return SCAN_ERR_READ;
}
if (ctx->mode == ARC_MODE_LIST) {
dyn_buffer_t buf = dyn_buffer_create();
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
dyn_buffer_append_string(&buf, file_path);
dyn_buffer_write_char(&buf, ' ');
}
}
dyn_buffer_write_char(&buf, '\0');
meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
meta_list->key = MetaContent;
strcpy(meta_list->str_val, buf.buf);
APPEND_META(doc, meta_list)
dyn_buffer_destroy(&buf);
} else {
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
sub_job->vfile.close = arc_close;
sub_job->vfile.read = arc_read;
sub_job->vfile.read_rewindable = arc_read_rewindable;
sub_job->vfile.reset = NULL;
sub_job->vfile.arc = a;
sub_job->vfile.filepath = sub_job->filepath;
sub_job->vfile.is_fs_file = FALSE;
sub_job->vfile.rewind_buffer_size = 0;
sub_job->vfile.rewind_buffer = NULL;
sub_job->vfile.log = ctx->log;
sub_job->vfile.logf = ctx->logf;
sub_job->vfile.has_checksum = FALSE;
sub_job->vfile.calculate_checksum = f->calculate_checksum;
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
sub_job->vfile.info = *archive_entry_stat(entry);
if (S_ISREG(sub_job->vfile.info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
if (utf8_name == NULL) {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
} else {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, utf8_name);
}
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
// Handle excludes
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
continue;
}
char *p = strrchr(sub_job->filepath, '.');
if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
sub_job->ext = (int) (p - sub_job->filepath + 1);
} else {
sub_job->ext = (int) strlen(sub_job->filepath);
}
SHA1_Init(&sub_job->vfile.sha1_ctx);
ctx->parse(sub_job);
}
}
free(sub_job);
}
archive_read_free(a);
return SCAN_OK;
}

third-party/libscan/libscan/arc/arc.h vendored Normal file

@@ -0,0 +1,80 @@
#ifndef SCAN_ARC_H
#define SCAN_ARC_H
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
# define ARC_SKIPPED (-1)
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
typedef struct {
archive_mode_t mode;
parse_callback_t parse;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
char passphrase[4096];
} scan_arc_ctx_t;
#define ARC_BUF_SIZE 8192
typedef struct {
vfile_t *f;
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
int arc_read(struct vfile *f, void *buf, size_t size);
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif
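
arc.h builds on libarchive's callback-driven reader: archive_read_open() is given opaque user data plus open/read/close callbacks, which is how sist2 streams nested archive members through a vfile_t instead of a path on disk. A minimal standalone illustration of that pattern (the FILE*-backed source below is purely for demonstration; the real vfile callbacks also feed the SHA1 checksum):

```c
#include <archive.h>
#include <archive_entry.h>
#include <stdio.h>

#define BUF_SIZE 8192

typedef struct {
    FILE *fp;
    char buf[BUF_SIZE];
} my_source_t;

static la_ssize_t my_read_cb(struct archive *a, void *user_data, const void **buf) {
    (void) a;
    my_source_t *src = user_data;
    *buf = src->buf;
    // Return the number of bytes placed in the buffer; 0 signals EOF.
    return (la_ssize_t) fread(src->buf, 1, sizeof(src->buf), src->fp);
}

static int my_close_cb(struct archive *a, void *user_data) {
    (void) a;
    fclose(((my_source_t *) user_data)->fp);
    return ARCHIVE_OK;
}

int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <archive>\n", argv[0]);
        return 1;
    }
    my_source_t src = {.fp = fopen(argv[1], "rb")};
    if (src.fp == NULL) {
        return 1;
    }

    struct archive *a = archive_read_new();
    archive_read_support_filter_all(a);
    archive_read_support_format_all(a);

    // Same shape as arc_open(): libarchive pulls bytes through our callbacks
    // instead of opening the file itself.
    if (archive_read_open(a, &src, NULL, my_read_cb, my_close_cb) != ARCHIVE_OK) {
        fprintf(stderr, "%s\n", archive_error_string(a));
        archive_read_free(a);
        return 1;
    }

    struct archive_entry *entry;
    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        printf("%s\n", archive_entry_pathname(entry));
        archive_read_data_skip(a);
    }
    archive_read_free(a);
    return 0;
}
```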


@@ -0,0 +1,58 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat info = *archive_entry_stat(entry);
if (S_ISREG(info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.');
if (p != NULL && (strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0)) {
size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size);
size_t read = archive_read_data(a, buf, entry_size);
if (read != entry_size) {
const char *err_str = archive_error_string(a);
if (err_str) {
CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str)
}
free(buf);
break;
}
ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path);
free(buf);
if (ret == TRUE) {
break;
}
}
}
}
archive_read_free(a);
}


@@ -0,0 +1,31 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
unsigned int cbr_mime;
unsigned int cbz_mime;
} scan_comic_ctx_t;
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
return mime == ctx->cbr_mime;
}
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
return mime == ctx->cbz_mime;
}
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif


@@ -0,0 +1,478 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
#include "../ocr/ocr.h"
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
pthread_mutex_t Mutex;
static void my_fz_lock(UNUSED(void *user), int lock) {
if (lock == FZ_LOCK_FREETYPE) {
pthread_mutex_lock(&Mutex);
}
}
static void my_fz_unlock(UNUSED(void *user), int lock) {
if (lock == FZ_LOCK_FREETYPE) {
pthread_mutex_unlock(&Mutex);
}
}
int pixmap_is_blank(const fz_pixmap *pixmap) {
int pixmap_size = pixmap->n * pixmap->w * pixmap->h;
const int pixel0 = pixmap->samples[0];
for (int i = 0; i < pixmap_size; i++) {
if (pixmap->samples[i] != pixel0) {
return FALSE;
}
}
return TRUE;
}
fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0;
fz_var(cover);
fz_var(err);
fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
fz_catch(fzctx)err = 1;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
return NULL;
}
fz_rect bounds = fz_bound_page(fzctx, *cover);
float scale;
float w = bounds.x1 - bounds.x0;
float h = bounds.y1 - bounds.y0;
if (w > h) {
scale = (float) ctx->tn_size / w;
} else {
scale = (float) ctx->tn_size / h;
}
fz_matrix m = fz_scale(scale, scale);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fz_device_rgb(fzctx), bbox, NULL, 0);
fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
fz_var(err);
fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
if (pixmap->n != 3) {
CTX_LOG_ERRORF(doc->filepath, "Got unexpected pixmap depth: %d", pixmap->n)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
return pixmap;
}
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
fz_page *cover = NULL;
fz_pixmap *pixmap = load_pixmap(ctx, 0, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
if (pixmap_is_blank(pixmap)) {
fz_drop_page(fzctx, cover);
fz_drop_pixmap(fzctx, pixmap);
CTX_LOG_DEBUG(doc->filepath, "Cover page is blank, using page 1 instead")
pixmap = load_pixmap(ctx, 1, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
}
// RGB24 -> YUV420p
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
pixmap->w, pixmap->h, AV_PIX_FMT_RGB24,
pixmap->w, pixmap->h, AV_PIX_FMT_YUV420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
1);
unsigned char *samples = calloc(1, 1024 * 1024 * 1024);
memcpy(samples, pixmap->samples, pixmap->stride * pixmap->h);
const uint8_t *in_data[1] = {samples,};
int in_line_size[1] = {(int) pixmap->stride};
sws_scale(sws_ctx,
in_data, in_line_size,
0, pixmap->h,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = pixmap->w;
scaled_frame->height = pixmap->h;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
// YUV420p -> JPEG
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, pixmap->w, pixmap->h)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
free(samples);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
fz_drop_pixmap(fzctx, pixmap);
fz_drop_page(fzctx, cover);
return TRUE;
}
void fz_err_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message)
}
void fz_warn_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message)
}
static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx);
static int mu_is_initialized = FALSE;
if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = TRUE;
}
fzctx->warn.print_user = doc;
fzctx->warn.print = fz_warn_callback;
fzctx->error.print_user = doc;
fzctx->error.print = fz_err_callback;
fzctx->locks.lock = my_fz_lock;
fzctx->locks.unlock = my_fz_unlock;
}
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
if (block->type != FZ_STEXT_BLOCK_TEXT) {
return 0;
}
fz_stext_line *line = block->u.t.first_line;
while (line != NULL) {
text_buffer_append_char(tex, ' ');
fz_stext_char *c = line->first_char;
while (c != NULL) {
if (text_buffer_append_char(tex, c->c) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
c = c->next;
}
line = line->next;
}
text_buffer_append_char(tex, ' ');
return 0;
}
static void fill_image_ocr_cb(const char* text, size_t len) {
text_buffer_append_string(&thread_buffer, text, len - 1);
}
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
UNUSED(fz_color_params color_params)) {
int l2factor = 0;
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);
fz_drop_pixmap(fzctx, pix);
}
}
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
thread_ctx = *ctx;
init_fzctx(fzctx, doc);
int err = 0;
fz_document *fzdoc = NULL;
fz_stream *stream = NULL;
fz_var(fzdoc);
fz_var(stream);
fz_var(err);
fz_try(fzctx) {
stream = fz_open_memory(fzctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
int page_count = -1;
fz_var(err);
fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
APPEND_LONG_META(doc, MetaPages, page_count)
if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
}
if (tn_only) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
char title[8192] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_catch(fzctx);
if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
char author[4096] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_catch(fzctx);
if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
if (ctx->content_size > 0) {
fz_stext_options opts = {0};
thread_buffer = text_buffer_create(ctx->content_size);
for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL;
fz_var(err);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_block *block = stext->first_block;
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
break;
}
}
text_buffer_terminate_string(&thread_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&thread_buffer);
}
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
}
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
text_buffer_t content_buffer = text_buffer_create(ctx->content_size);
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat info = *archive_entry_stat(entry);
if (S_ISREG(info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.');
if (p != NULL && (strcmp(p, ".html") == 0 || (strcmp(p, ".xhtml") == 0))) {
size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size + 1);
size_t read = archive_read_data(a, buf, entry_size);
*(char *) (buf + entry_size) = '\0';
if (read != entry_size) {
const char *err_str = archive_error_string(a);
if (err_str) {
CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
}
free(buf);
break;
}
ret = text_buffer_append_markup(&content_buffer, buf);
free(buf);
if (ret == TEXT_BUF_FULL) {
break;
}
}
}
}
text_buffer_terminate_string(&content_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&content_buffer);
archive_read_free(a);
}
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
if (ctx->fast_epub_parse && is_epub(mime_str)) {
parse_epub_fast(ctx, f, doc);
return;
}
size_t buf_len;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE);
free(buf);
}


@@ -0,0 +1,30 @@
#ifndef SCAN_EBOOK_H
#define SCAN_EBOOK_H
#include "../scan.h"
typedef struct {
long content_size;
int tn_size;
const char *tesseract_lang;
const char *tesseract_path;
pthread_mutex_t mupdf_mutex;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int fast_epub_parse;
float tn_qscale;
} scan_ebook_ctx_t;
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
__always_inline
static int is_epub(const char *mime_string) {
return strcmp(mime_string, "application/epub+zip") == 0;
}
#endif

third-party/libscan/libscan/font/font.c vendored Normal file

@@ -0,0 +1,246 @@
#include "font.h"
#include <ft2build.h>
#include <freetype/freetype.h>
#include "../util.h"
__thread FT_Library ft_lib = NULL;
typedef struct text_dimensions {
unsigned int width;
unsigned int height;
unsigned int baseline;
} text_dimensions_t;
typedef struct glyph {
int top;
int height;
int width;
int descent;
int ascent;
int advance_width;
unsigned char *pixmap;
} glyph_t;
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
FT_Vector kerning;
FT_Get_Kerning(face, c, pc, FT_KERNING_DEFAULT, &kerning);
return (int) (kerning.x / 64);
}
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
glyph_t glyph;
glyph.pixmap = slot->bitmap.buffer;
glyph.width = (int) slot->bitmap.width;
glyph.height = (int) slot->bitmap.rows;
glyph.top = slot->bitmap_top;
glyph.advance_width = (int) slot->advance.x / 64;
glyph.descent = MAX(0, glyph.height - glyph.top);
glyph.ascent = MAX(0, MAX(glyph.top, glyph.height) - glyph.descent);
return glyph;
}
text_dimensions_t text_dimension(char *text, FT_Face face) {
text_dimensions_t dimensions;
dimensions.width = 0;
int num_chars = (int) strlen(text);
unsigned int max_ascent = 0;
int max_descent = 0;
char pc = 0;
for (int i = 0; i < num_chars; i++) {
char c = text[i];
FT_Load_Char(face, c, 0);
glyph_t glyph = ft_glyph_to_glyph(face->glyph);
max_descent = MAX(max_descent, glyph.descent);
max_ascent = MAX(max_ascent, MAX(glyph.height, glyph.ascent));
int kerning_x = kerning_offset(c, pc, face);
dimensions.width += MAX(glyph.advance_width, glyph.width) + kerning_x;
pc = c;
}
dimensions.height = max_ascent + max_descent;
dimensions.baseline = max_descent;
return dimensions;
}
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
unsigned int src = 0;
unsigned int dst = y * text_info.width + x;
unsigned int row_offset = text_info.width - glyph->width;
unsigned int buf_len = text_info.width * text_info.height;
for (unsigned int sy = 0; sy < glyph->height; sy++) {
for (unsigned int sx = 0; sx < glyph->width; sx++) {
if (dst < buf_len) {
bitmap[dst] |= glyph->pixmap[src];
}
src++;
dst++;
}
dst += row_offset;
}
}
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {
dyn_buffer_write_short(buf, 0x4D42); // Magic
dyn_buffer_write_int(buf, 0); // Size placeholder
dyn_buffer_write_int(buf, 0x5157); //Reserved
dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset
dyn_buffer_write_int(buf, 40); // DIB size
dyn_buffer_write_int(buf, (int) dimensions.width);
dyn_buffer_write_int(buf, (int) dimensions.height);
dyn_buffer_write_short(buf, 1); // Color planes
dyn_buffer_write_short(buf, 8); // bits per pixel
dyn_buffer_write_int(buf, 0); // compression
dyn_buffer_write_int(buf, 0); // Ignored
dyn_buffer_write_int(buf, 3800); // hres
dyn_buffer_write_int(buf, 3800); // vres
dyn_buffer_write_int(buf, 256); // Color count
dyn_buffer_write_int(buf, 0); // Ignored
// RGBA32 Color table (Grayscale)
for (int i = 255; i >= 0; i--) {
dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
}
// Pixel array: write from bottom to top, with rows padded to multiples of 4-bytes
for (int y = (int) dimensions.height - 1; y >= 0; y--) {
for (unsigned int x = 0; x < dimensions.width; x++) {
dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
}
while (buf->cur % 4 != 0) {
dyn_buffer_write_char(buf, 0);
}
}
// Size
*(int *) ((char *) buf->buf + 2) = buf->cur;
}
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (ft_lib == NULL) {
FT_Init_FreeType(&ft_lib);
}
size_t buf_len = 0;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FT_Face face;
FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
if (err != 0) {
CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
FT_Error_String(err))
free(buf);
return;
}
char font_name[4096];
if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
if (face->family_name == NULL) {
strcpy(font_name, "(null)");
} else {
strncpy(font_name, face->family_name, sizeof(font_name));
}
} else {
snprintf(font_name, sizeof(font_name), "%s %s", face->family_name, face->style_name);
}
meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name));
meta_name->key = MetaFontName;
strcpy(meta_name->str_val, font_name);
APPEND_META(doc, meta_name)
// Thumbnails disabled: keep only the font name metadata
if (ctx->enable_tn == FALSE) {
FT_Done_Face(face);
free(buf);
return;
}
int pixel = 64;
int num_chars = (int) strlen(font_name);
err = FT_Set_Pixel_Sizes(face, 0, pixel);
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
FT_Error_String(err))
FT_Done_Face(face);
free(buf);
return;
}
text_dimensions_t dimensions = text_dimension(font_name, face);
unsigned char *bitmap = calloc(dimensions.width * dimensions.height, 1);
FT_Vector pen;
pen.x = 0;
char pc = 0;
for (int i = 0; i < num_chars; i++) {
char c = font_name[i];
err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
if (err != 0) {
c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
FT_Error_String(err))
continue;
}
}
glyph_t glyph = ft_glyph_to_glyph(face->glyph);
pen.x += kerning_offset(c, pc, face);
if (pen.x <= 0) {
pen.x = ABS(glyph.advance_width - glyph.width);
}
pen.y = dimensions.height - glyph.ascent - dimensions.baseline;
draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);
pen.x += glyph.advance_width;
pc = c;
}
dyn_buffer_t bmp_data = dyn_buffer_create();
bmp_format(&bmp_data, dimensions, bitmap);
APPEND_TN_META(doc, dimensions.width, dimensions.height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);
dyn_buffer_destroy(&bmp_data);
free(bitmap);
FT_Done_Face(face);
free(buf);
}
void cleanup_font() {
FT_Done_FreeType(ft_lib);
}

17
third-party/libscan/libscan/font/font.h vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef SCAN_FONT_H
#define SCAN_FONT_H
#include "../scan.h"
typedef struct {
int enable_tn;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_font_ctx_t;
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc);
void cleanup_font();
#endif

119
third-party/libscan/libscan/json/json.c vendored Normal file
View File

@@ -0,0 +1,119 @@
#include "json.h"
#include "cjson/cJSON.h"
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
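// Recursively collect every string value in the JSON tree into the text buffer; returns TRUE once the buffer is full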
int json_extract_text(cJSON *json, text_buffer_t *tex) {
if (cJSON_IsObject(json)) {
for (cJSON *child = json->child; child != NULL; child = child->next) {
if (json_extract_text(child, tex)) {
return TRUE;
}
}
} else if (cJSON_IsArray(json)) {
cJSON *child;
cJSON_ArrayForEach(child, json) {
if (json_extract_text(child, tex)) {
return TRUE;
}
}
} else if (cJSON_IsString(json)) {
if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
return TRUE;
}
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
return TRUE;
}
}
return FALSE;
}
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (f->info.st_size > JSON_MAX_FILE_SIZE) {
CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
return SCAN_ERR_SKIP;
}
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
return SCAN_ERR_READ;
}
buf_len += 1;
buf = realloc(buf, buf_len);
*(buf + buf_len - 1) = '\0';
cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
text_buffer_t tex = text_buffer_create(ctx->content_size);
json_extract_text(json, &tex);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
cJSON_Delete(json);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}
#define JSON_BUF_SIZE (1024 * 1024 * 5)
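// NDJSON is parsed in a streaming fashion: refill the fixed-size buffer, parse one object at a time, then move the unconsumed tail back to the start of the buffer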
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
*(buf + JSON_BUF_SIZE) = '\0';
text_buffer_t tex = text_buffer_create(ctx->content_size);
size_t ret;
int eof = FALSE;
const char *parse_end = buf;
size_t to_read;
char *ptr = buf;
while (TRUE) {
cJSON *json;
if (!eof) {
to_read = parse_end == buf ? JSON_BUF_SIZE : parse_end - buf;
ret = f->read(f, ptr, to_read);
if (ret != to_read) {
eof = TRUE;
}
}
json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);
if (parse_end == buf + JSON_BUF_SIZE) {
CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
cJSON_Delete(json);
break;
}
if (parse_end == buf) {
cJSON_Delete(json);
break;
}
json_extract_text(json, &tex);
cJSON_Delete(json);
memmove(buf, parse_end, (buf + JSON_BUF_SIZE - parse_end));
ptr = buf + JSON_BUF_SIZE - parse_end + buf;
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}

30
third-party/libscan/libscan/json/json.h vendored Normal file
View File

@@ -0,0 +1,30 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
unsigned int json_mime;
unsigned int ndjson_mime;
} scan_json_ctx_t;
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
return mime == ctx->json_mime;
}
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
return mime == ctx->ndjson_mime;
}
#endif

62
third-party/libscan/libscan/macros.h vendored Normal file
View File

@@ -0,0 +1,62 @@
#ifndef FALSE
#define FALSE (0)
#define BOOL int
#endif
#ifndef TRUE
#define TRUE (!FALSE)
#endif
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \
strcpy(meta_str->str_val, value); \
APPEND_META(doc, meta_str)}
#define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_long->key = keyname; \
meta_long->long_val = value; \
APPEND_META(doc, meta_long)}
#define APPEND_TN_META(doc, width, height) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \
meta_str->key = MetaThumbnail; \
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
APPEND_META(doc, meta_str)}
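// Append a meta_line_t to the document's singly-linked metadata list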
#define APPEND_META(doc, meta) \
meta->next = NULL;\
if (doc->meta_head == NULL) {\
doc->meta_head = meta;\
doc->meta_tail = doc->meta_head;\
} else {\
doc->meta_tail->next = meta;\
doc->meta_tail = meta;\
}
#define APPEND_UTF8_META(doc, keyname, str) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, str); \
text_buffer_terminate_string(&tex); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
APPEND_META(doc, meta_tag) \
text_buffer_destroy(&tex);

View File

@@ -0,0 +1,809 @@
#include "media.h"
#include "../ocr/ocr.h"
#include <ctype.h>
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
#define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)
#define STREAM_IS_IMAGE (stream->nb_frames <= 1)
#define STORE_AS_IS ((void*)-1)
// Pointer to document being processed
__thread document_t *thread_doc;
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
int has_extension = doc->ext > doc->base;
if (!has_extension) {
if (strcmp(mime_str, "image/png") == 0) {
return "file.png";
} else if (strcmp(mime_str, "image/jpeg") == 0) {
return "file.jpg";
}
}
return filepath;
}
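// Downscale a decoded frame to the thumbnail size while preserving aspect ratio; returns STORE_AS_IS when a small JPEG/PNG can be stored verbatim, or NULL when no thumbnail should be produced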
__always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
return NULL;
}
int dstW;
int dstH;
if (frame->width <= size && frame->height <= size) {
if (decoder->codec_id == AV_CODEC_ID_MJPEG || decoder->codec_id == AV_CODEC_ID_PNG) {
return STORE_AS_IS;
}
dstW = frame->width;
dstH = frame->height;
} else {
double ratio = (double) frame->width / frame->height;
if (frame->width > frame->height) {
dstW = size;
dstH = (int) (size / ratio);
} else {
dstW = (int) (size * ratio);
dstH = size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return NULL;
}
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
decoder->width, decoder->height, decoder->pix_fmt,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, decoder->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
return scaled_frame;
}
typedef struct {
AVPacket *packet;
AVFrame *frame;
} frame_and_packet_t;
static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) {
if (frame_and_packet->packet != NULL) {
av_packet_free(&frame_and_packet->packet);
}
if (frame_and_packet->frame != NULL) {
av_frame_free(&frame_and_packet->frame);
}
free(frame_and_packet->packet);
free(frame_and_packet);
}
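// Decode the subtitle stream and append the dialogue text (the part after the \N marker of each ASS rect) to the document content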
__always_inline
static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) {
text_buffer_t tex = text_buffer_create(-1);
AVPacket packet;
AVSubtitle subtitle;
AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
avcodec_open2(decoder, subtitle_codec, NULL);
decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
int got_sub;
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, &packet);
if (read_frame_ret != 0) {
break;
}
if (packet.stream_index != stream_idx) {
av_packet_unref(&packet);
continue;
}
avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet);
if (got_sub) {
for (int i = 0; i < subtitle.num_rects; i++) {
const char *text = subtitle.rects[i]->ass;
if (text == NULL) {
continue;
}
char *idx = strstr(text, "\\N");
if (idx != NULL && strlen(idx + 2) > 1) {
text_buffer_append_string0(&tex, idx + 2);
text_buffer_append_char(&tex, ' ');
}
}
avsubtitle_free(&subtitle);
}
av_packet_unref(&packet);
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
text_buffer_destroy(&tex);
avcodec_free_context(&decoder);
}
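// Read packets from the target stream and feed them to the decoder until a full frame is available; returns NULL on EOF or on a decoding error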
__always_inline
static frame_and_packet_t *
read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx,
document_t *doc) {
frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t));
result->packet = av_packet_alloc();
result->frame = av_frame_alloc();
av_init_packet(result->packet);
int receive_ret = -EAGAIN;
while (receive_ret == -EAGAIN) {
// Get video frame
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, result->packet);
if (read_frame_ret != 0) {
if (read_frame_ret != AVERROR_EOF) {
CTX_LOG_WARNINGF(doc->filepath,
"(media.c) avcodec_read_frame() returned error code [%d] %s",
read_frame_ret, av_err2str(read_frame_ret)
)
}
frame_and_packet_free(result);
return NULL;
}
//Ignore audio/other frames
if (result->packet->stream_index != stream_idx) {
av_packet_unref(result->packet);
continue;
}
break;
}
// Feed it to decoder
int decode_ret = avcodec_send_packet(decoder, result->packet);
if (decode_ret != 0) {
CTX_LOG_ERRORF(doc->filepath,
"(media.c) avcodec_send_packet() returned error code [%d] %s",
decode_ret, av_err2str(decode_ret)
)
frame_and_packet_free(result);
return NULL;
}
receive_ret = avcodec_receive_frame(decoder, result->frame);
if (receive_ret == -EAGAIN && result->packet != NULL) {
av_packet_unref(result->packet);
}
}
return result;
}
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {
meta_line_t *meta = doc->meta_head;
while (meta != NULL) {
if (meta->key == key) {
CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'",
key, meta->str_val, key, tag->value)
return;
}
meta = meta->next;
}
text_buffer_t tex = text_buffer_create(-1);
text_buffer_append_string0(&tex, tag->value);
text_buffer_terminate_string(&tex);
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta_tag->key = key;
strcpy(meta_tag->str_val, tex.dyn_buffer.buf);
APPEND_META(doc, meta_tag)
text_buffer_destroy(&tex);
}
#define APPEND_TAG_META(keyname) \
APPEND_UTF8_META(doc, keyname, tag->value)
#define STRCPY_TOLOWER(dst, str) \
strncpy(dst, str, sizeof(dst)); \
char *ptr = dst; \
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
AVDictionaryEntry *tag = NULL;
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
APPEND_TAG_META(MetaArtist)
} else if (strcmp(key, "genre") == 0) {
APPEND_TAG_META(MetaGenre)
} else if (strcmp(key, "title") == 0) {
APPEND_TAG_META(MetaTitle)
} else if (strcmp(key, "album_artist") == 0) {
APPEND_TAG_META(MetaAlbumArtist)
} else if (strcmp(key, "album") == 0) {
APPEND_TAG_META(MetaAlbum)
} else if (strcmp(key, "comment") == 0) {
APPEND_TAG_META(MetaContent)
}
}
}
__always_inline
static void
append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int is_video) {
if (is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
if (meta_duration->long_val > INT32_MAX) {
meta_duration->long_val = 0;
}
APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->long_val = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
}
AVDictionaryEntry *tag = NULL;
if (is_video) {
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "title") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaTitle);
} else if (strcmp(key, "comment") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
}
}
} else {
// EXIF metadata
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(key, "imagedescription") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake)
} else if (strcmp(key, "model") == 0) {
APPEND_TAG_META(MetaExifModel)
} else if (strcmp(key, "software") == 0) {
APPEND_TAG_META(MetaExifSoftware)
} else if (strcmp(key, "fnumber") == 0) {
APPEND_TAG_META(MetaExifFNumber)
} else if (strcmp(key, "focallength") == 0) {
APPEND_TAG_META(MetaExifFocalLength)
} else if (strcmp(key, "usercomment") == 0) {
APPEND_TAG_META(MetaExifUserComment)
} else if (strcmp(key, "isospeedratings") == 0) {
APPEND_TAG_META(MetaExifIsoSpeedRatings)
} else if (strcmp(key, "exposuretime") == 0) {
APPEND_TAG_META(MetaExifExposureTime)
} else if (strcmp(key, "datetime") == 0) {
APPEND_TAG_META(MetaExifDateTime)
} else if (strcmp(key, "gpslatitude") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeDMS)
} else if (strcmp(key, "gpslatituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeRef)
} else if (strcmp(key, "gpslongitude") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeDMS)
} else if (strcmp(key, "gpslongituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeRef)
}
}
}
}
static void ocr_image_cb(const char *text, size_t len) {
APPEND_STR_META(thread_doc, MetaContent, text);
}
#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32
#define OCR_BYTES_PER_PIXEL 4
#define OCR_PIXELS_PER_INCH 70
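// Convert the frame to RGB32 and run Tesseract on it; any recognized text is appended to the document through ocr_image_cb()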
void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {
// Convert to RGB32
AVFrame *rgb_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
frame->width, frame->height, decoder->pix_fmt,
frame->width, frame->height, OCR_PIXEL_FORMAT,
SWS_LANCZOS, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf, OCR_PIXEL_FORMAT, frame->width, frame->height,
1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, frame->height,
rgb_frame->data, rgb_frame->linesize
);
thread_doc = doc;
ocr_extract_text(
ctx->tesseract_path,
ctx->tesseract_lang,
rgb_frame->data[0],
frame->width,
frame->height,
OCR_BYTES_PER_PIXEL,
rgb_frame->linesize[0],
OCR_PIXELS_PER_INCH,
ocr_image_cb
);
sws_freeContext(sws_ctx);
av_free(*rgb_frame->data);
av_frame_free(&rgb_frame);
}
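// Pick an audio, video and subtitle stream, extract their metadata/tags, and optionally OCR the image and generate a thumbnail from the video stream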
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
int subtitle_stream = -1;
avformat_find_stream_info(pFormatCtx, NULL);
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
AVStream *stream = pFormatCtx->streams[i];
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
if (audio_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
}
audio_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
if (video_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaVideoCodec, desc->name)
}
meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth;
meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight;
meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h)
video_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
subtitle_stream = i;
}
}
if (subtitle_stream != -1 && ctx->read_subtitles) {
read_subtitles(ctx, pFormatCtx, subtitle_stream, doc);
// Reset stream
if (video_stream != -1) {
av_seek_frame(pFormatCtx, video_stream, 0, 0);
}
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream];
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
//Seek
if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
(long) ((double) stream->duration * 0.10), 0);
if (seek_ret == 0) {
break;
}
}
}
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, video_stream, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
}
// NOTE: OCR'd content takes precedence over exif image description
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (scaled_frame == STORE_AS_IS) {
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
avcodec_free_context(&jpeg_encoder);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
}
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
}
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return;
}
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
parse_media_format_ctx(ctx, pFormatCtx, doc);
}
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
struct vfile *f = ptr;
int ret = f->read(f, buf, buf_size);
if (ret == 0) {
return AVERROR_EOF;
}
return ret;
}
typedef struct {
size_t size;
FILE *file;
void *buf;
} memfile_t;
int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
memfile_t *mem = ptr;
size_t ret = fread(buf, 1, buf_size, mem->file);
if (ret == 0 && feof(mem->file)) {
return AVERROR_EOF;
}
return (int) ret;
}
long memfile_seek(void *ptr, long offset, int whence) {
memfile_t *mem = ptr;
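// whence == 0x10000 is AVSEEK_SIZE: FFmpeg asks for the total stream size instead of performing a seek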
if (whence == 0x10000) {
return (long) mem->size;
}
int ret = fseek(mem->file, offset, whence);
if (ret != 0) {
return AVERROR_EOF;
}
return ftell(mem->file);
}
int memfile_open(vfile_t *f, memfile_t *mem) {
mem->size = f->info.st_size;
mem->buf = malloc(mem->size);
if (mem->buf == NULL) {
return -1;
}
int ret = f->read(f, mem->buf, mem->size);
mem->file = fmemopen(mem->buf, mem->size, "rb");
if (f->calculate_checksum) {
SHA1_Init(&f->sha1_ctx);
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
f->has_checksum = TRUE;
}
return (ret == mem->size && mem->file != NULL) ? 0 : -1;
}
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
mem->size = (int) buf_len;
mem->buf = buf;
mem->file = fmemopen(mem->buf, mem->size, "rb");
return mem->file != NULL ? 0 : -1;
}
void memfile_close(memfile_t *mem) {
if (mem->buf != NULL) {
free(mem->buf);
fclose(mem->file);
}
}
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return;
}
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
AVIOContext *io_ctx = NULL;
memfile_t memfile = {0, 0, 0};
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
if (f->info.st_size <= ctx->max_media_buffer) {
int ret = memfile_open(f, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
}
}
if (io_ctx == NULL) {
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
}
pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
if (res != -5) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
}
av_free(io_ctx->buffer);
memfile_close(&memfile);
avio_context_free(&io_ctx);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
parse_media_format_ctx(ctx, pFormatCtx, doc);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
memfile_close(&memfile);
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
if (f->is_fs_file) {
parse_media_filename(ctx, f->filepath, doc);
} else {
parse_media_vfile(ctx, f, doc, mime_str);
}
}
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
memfile_t memfile = {0, 0, 0};
AVIOContext *io_ctx = NULL;
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return FALSE;
}
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
int ret = memfile_open_buf(buf, buf_len, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%ldB)", buf_len)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
} else {
// memfile_open_buf() failed: memfile.file is NULL, so only the AVIO buffer needs to be released
av_free(buffer);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return FALSE;
}
pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, url, NULL, NULL);
if (res != 0) {
av_free(io_ctx->buffer);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
AVStream *stream = pFormatCtx->streams[0];
// Decoder
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, 0, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
if (scaled_frame == STORE_AS_IS) {
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
avcodec_free_context(&jpeg_encoder);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return TRUE;
}

View File

@@ -0,0 +1,55 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "../scan.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavcodec/avcodec.h"
#include "libavutil/imgutils.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
long max_media_buffer;
int read_subtitles;
const char *tesseract_lang;
const char *tesseract_path;
} scan_media_ctx_t;
__always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
jpeg->width = w;
jpeg->height = h;
jpeg->time_base.den = 1000000;
jpeg->time_base.num = 1;
jpeg->i_quant_factor = qscale;
jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;
int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
if (ret != 0) {
return NULL;
}
return jpeg;
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif

View File

@@ -0,0 +1,79 @@
#include "scan_mobi.h"
#include <mobi.h>
#include <errno.h>
#include "stdlib.h"
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {
MOBIData *m = mobi_init();
if (m == NULL) {
CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
return;
}
size_t buf_len;
char* buf = read_all(f, &buf_len);
if (buf == NULL) {
mobi_free(m);
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
mobi_free(m);
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
MOBI_RET mobi_ret = mobi_load_file(m, file);
fclose(file);
if (mobi_ret != MOBI_SUCCESS) {
mobi_free(m);
free(buf);
CTX_LOG_ERRORF(f->filepath, "mobi_laod_file() returned error code [%d]", mobi_ret)
return;
}
char *author = mobi_meta_get_author(m);
if (author != NULL) {
APPEND_STR_META(doc, MetaAuthor, author)
free(author);
}
char *title = mobi_meta_get_title(m);
if (title != NULL) {
APPEND_STR_META(doc, MetaTitle, title)
free(title);
}
const size_t maxlen = mobi_get_text_maxsize(m);
if (maxlen == MOBI_NOTSET) {
mobi_free(m);
free(buf);
CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
return;
}
char *content_str = malloc(maxlen + 1);
size_t length = maxlen;
mobi_ret = mobi_get_rawml(m, content_str, &length);
if (mobi_ret != MOBI_SUCCESS) {
mobi_free(m);
free(content_str);
free(buf);
CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
return;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_markup(&tex, content_str);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
free(content_str);
free(buf);
text_buffer_destroy(&tex);
mobi_free(m);
}

View File

@@ -0,0 +1,14 @@
#ifndef SCAN_SCAN_MOBI_H
#define SCAN_SCAN_MOBI_H
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
} scan_mobi_ctx_t;
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@@ -0,0 +1,147 @@
#include "msdoc.h"
#include <errno.h>
#include <sys/mman.h>
#include "../../third-party/antiword/src/antiword.h"
#include "../ebook/ebook.h"
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {
// Open word doc
options_type *opts = direct_vGetOptions();
opts->iParagraphBreak = 74;
opts->eConversionType = conversion_text;
opts->bHideHiddenText = 1;
opts->bRemoveRemovedText = 1;
opts->bUseLandscape = 0;
opts->eEncoding = encoding_utf_8;
opts->iPageHeight = 842; // A4
opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) {
free(buf);
return;
}
rewind(file_in);
size_t out_len;
char *out_buf;
FILE *file_out = open_memstream(&out_buf, &out_len);
diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
if (diag == NULL) {
// file_in is owned by the caller (parse_msdoc), which closes it; only release what was allocated here
fclose(file_out);
free(out_buf);
free(buf);
return;
}
iInitDocument(file_in, (int) buf_len);
const char *author = szGetAuthor();
if (author != NULL) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
const char *title = szGetTitle();
if (title != NULL) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
vFreeDocument();
bWordDecryptor(file_in, (int) buf_len, diag);
vDestroyDiagram(diag);
fclose(file_out);
if (buf_len > 0) {
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_string(&tex, out_buf, out_len);
text_buffer_terminate_string(&tex);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&tex);
}
free(buf);
free(out_buf);
}
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {
scan_ebook_ctx_t ebook_ctx = {
.content_size = ctx->content_size,
.tn_size = ctx->tn_size,
.log = ctx->log,
.logf = ctx->logf,
.store = ctx->store,
};
// Open word doc
options_type *opts = direct_vGetOptions();
opts->iParagraphBreak = 74;
opts->eConversionType = conversion_pdf;
opts->bHideHiddenText = 1;
opts->bRemoveRemovedText = 1;
opts->bUseLandscape = 0;
opts->eEncoding = encoding_latin_1;
opts->iPageHeight = 842; // A4
opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) {
free(buf);
return;
}
rewind(file);
size_t out_len;
char *out_buf;
FILE *file_out = open_memstream(&out_buf, &out_len);
diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
if (diag == NULL) {
fclose(file_out);
free(out_buf);
free(buf);
return;
}
bWordDecryptor(file, (int) buf_len, diag);
vDestroyDiagram(diag);
fclose(file_out);
parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);
free(buf);
free(out_buf);
}
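// The .doc file is converted twice with antiword: to PDF for the thumbnail (via the ebook parser) and to plain text for the content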
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
if (ctx->tn_size > 0) {
char *buf_pdf = malloc(buf_len);
memcpy(buf_pdf, buf, buf_len);
parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
}
parse_msdoc_text(ctx, doc, file, buf, buf_len);
fclose(file);
}

View File

@@ -0,0 +1,24 @@
#ifndef SCAN_SCAN_MSDOC_H
#define SCAN_SCAN_MSDOC_H
#include "../scan.h"
typedef struct {
long content_size;
int tn_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
unsigned int msdoc_mime;
} scan_msdoc_ctx_t;
__always_inline
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
return mime == ctx->msdoc_mime;
}
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

47
third-party/libscan/libscan/ocr/ocr.h vendored Normal file
View File

@@ -0,0 +1,47 @@
#ifndef OCR_H
#define OCR_H
#include "../scan.h"
#include <tesseract/capi.h>
#define MIN_OCR_WIDTH 350
#define MIN_OCR_HEIGHT 100
#define MIN_OCR_LEN 10
#define OCR_IS_VALID_BPP(d) \
((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || (d) == 16 || (d) == 24 || \
(d) == 32)
typedef void (*ocr_extract_callback_t)(const char *, size_t);
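// Run Tesseract on a raw image buffer; images that are too small or have an unsupported bytes-per-pixel value are skipped, and results shorter than MIN_OCR_LEN are discarded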
__always_inline static void
ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
const unsigned char *img_buf, const int img_w, const int img_h,
const int img_bpp, const int img_stride, const int img_xres,
const ocr_extract_callback_t cb) {
if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
!OCR_IS_VALID_BPP(img_bpp)) {
return;
}
TessBaseAPI *api = TessBaseAPICreate();
TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
TessBaseAPISetSourceResolution(api, img_xres);
char *text = TessBaseAPIGetUTF8Text(api);
if (text != NULL) {
size_t len = strlen(text);
if (len >= MIN_OCR_LEN) {
cb(text, len);
}
TessDeleteText(text);
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
}
#endif
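For illustration only (not part of this diff): a minimal sketch of how a caller might invoke ocr_extract_text() on a raw grayscale buffer. Only the ocr_extract_text() signature comes from the header above; the tessdata path, language, image and dimensions below are placeholder assumptions, and the program still needs the libscan headers and Tesseract to build.

#include <stdio.h>
#include <string.h>
#include "ocr.h"

// Hypothetical callback: print whatever text Tesseract recognized
static void print_ocr_text(const char *text, size_t len) {
    printf("OCR result (%zu bytes): %s\n", len, text);
}

int main(void) {
    // Hypothetical 400x200 8-bit grayscale image filled with white (no text will be found)
    static unsigned char img[400 * 200];
    memset(img, 0xFF, sizeof(img));
    // Path and language are assumptions; adjust to the local tessdata install
    ocr_extract_text("/usr/share/tessdata", "eng",
                     img, 400, 200,
                     1 /* bytes per pixel */, 400 /* stride */, 72 /* DPI */,
                     print_ocr_text);
    return 0;
}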

View File

@@ -0,0 +1,260 @@
#include "ooxml.h"
#include <archive.h>
#include <archive_entry.h>
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
#define _X(str) ((const xmlChar*)str)
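// Only OOXML parts that contain user-visible text (document body, notes, headers/footers, slides, worksheets) are parsed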
__always_inline
static int should_read_part(const char *part) {
if (part == NULL) {
return FALSE;
}
if ( // Word
STR_STARTS_WITH_CONSTANT(part, "word/document.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/endnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footer")
|| STR_STARTS_WITH_CONSTANT(part, "word/header")
// PowerPoint
|| STR_STARTS_WITH_CONSTANT(part, "ppt/slides/slide")
|| STR_STARTS_WITH_CONSTANT(part, "ppt/notesSlides/slide")
// Excel
|| STR_STARTS_WITH_CONSTANT(part, "xl/worksheets/sheet")
|| STR_STARTS_WITH_CONSTANT(part, "xl/sharedStrings.xml")
|| STR_STARTS_WITH_CONSTANT(part, "xl/workbook.xml")
) {
return TRUE;
}
return FALSE;
}
int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
//TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
xmlErrorPtr err = xmlGetLastError();
if (err != NULL) {
if (err->level == XML_ERR_FATAL) {
CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
return -1;
}
}
for (xmlNode *child = node; child; child = child->next) {
if (child->name != NULL && *child->name == 't' && *(child->name + 1) == '\0') {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text) {
int ret = text_buffer_append_string0(buf, (char *) text);
text_buffer_append_char(buf, ' ');
xmlFree(text);
if (ret == TEXT_BUF_FULL) {
return ret;
}
}
}
if (extract_text(ctx, xml, child->children, buf) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}
int xml_io_read(void *context, char *buffer, int len) {
struct archive *a = context;
return (int) archive_read_data(a, buffer, len);
}
int xml_io_close(UNUSED(void *context)) {
//noop
return 0;
}
#define READ_PART_ERR (-2)
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return READ_PART_ERR;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return READ_PART_ERR;
}
int ret = extract_text(ctx, xml, root, buf);
xmlFreeDoc(xml);
return ret;
}
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return -1;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return -1;
}
if (xmlStrEqual(root->name, _X("Properties"))) {
for (xmlNode *child = root->children; child; child = child->next) {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text == NULL) {
continue;
}
if (xmlStrEqual(child->name, _X("Pages"))) {
APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
}
xmlFree(text);
}
}
xmlFreeDoc(xml);
return 0;
}
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return -1;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return -1;
}
if (xmlStrEqual(root->name, _X("coreProperties"))) {
for (xmlNode *child = root->children; child; child = child->next) {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text == NULL) {
continue;
}
if (xmlStrEqual(child->name, _X("title"))) {
APPEND_STR_META(doc, MetaTitle, (char *) text)
} else if (xmlStrEqual(child->name, _X("creator"))) {
APPEND_STR_META(doc, MetaAuthor, (char *) text)
} else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) {
APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
}
xmlFree(text);
}
}
xmlFreeDoc(xml);
return 0;
}
#define MAX_TN_SIZE (1024 * 1024 * 15)
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
size_t entry_size = archive_entry_size(entry);
if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
return;
}
char *buf = malloc(entry_size);
archive_read_data(a, buf, entry_size);
APPEND_TN_META(doc, 1, 1) // Size unknown
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, entry_size);
free(buf);
}
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
struct archive *a = archive_read_new();
archive_read_support_format_zip(a);
int ret = archive_read_open_memory(a, buf, buf_len);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
archive_read_free(a);
free(buf);
return;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
struct archive_entry *entry;
int buffer_full = FALSE;
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char *path = archive_entry_pathname(entry);
if (!buffer_full && should_read_part(path) && ctx->content_size > 0) {
ret = read_part(ctx, a, &tex, doc);
if (ret == READ_PART_ERR) {
break;
} else if (ret == TEXT_BUF_FULL) {
buffer_full = TRUE;
}
} else if (strcmp(path, "docProps/app.xml") == 0) {
if (read_doc_props_app(ctx, a, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/core.xml") == 0) {
if (read_doc_props(ctx, a, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
read_thumbnail(ctx, doc, a, entry);
}
}
}
if (tex.dyn_buffer.cur > 0) {
text_buffer_terminate_string(&tex);
meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta->key = MetaContent;
strcpy(meta->str_val, tex.dyn_buffer.buf);
APPEND_META(doc, meta)
}
archive_read_close(a);
archive_read_free(a);
text_buffer_destroy(&tex);
free(buf);
}

View File

@@ -0,0 +1,16 @@
#ifndef SCAN_OOXML_H
#define SCAN_OOXML_H
#include <stdlib.h>
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

224
third-party/libscan/libscan/raw/raw.c vendored Normal file
View File

@@ -0,0 +1,224 @@
#include "raw.h"
#include <libraw/libraw.h>
#include "../media/media.h"
#include <unistd.h>
#define MIN_SIZE 32
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
}
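// Scale an RGB24 bitmap preview down to tn_size (preserving aspect ratio) and store it as a JPEG thumbnail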
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
int dstW;
int dstH;
if (img->width <= ctx->tn_size && img->height <= ctx->tn_size) {
dstW = img->width;
dstH = img->height;
} else {
double ratio = (double) img->width / img->height;
if (img->width > img->height) {
dstW = ctx->tn_size;
dstH = (int) (ctx->tn_size / ratio);
} else {
dstW = (int) (ctx->tn_size * ratio);
dstH = ctx->tn_size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return FALSE;
}
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
img->width, img->height, AV_PIX_FMT_RGB24,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
const uint8_t *in_data[1] = {img->data};
int in_line_size[1] = {3 * img->width};
sws_scale(sws_ctx,
in_data, in_line_size,
0, img->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
return TRUE;
}
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
libraw_data_t *libraw_lib = libraw_init(0);
if (!libraw_lib) {
CTX_LOG_ERROR("raw.c", "Cannot create libraw handle")
return;
}
size_t buf_len = 0;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
int ret = libraw_open_buffer(libraw_lib, buf, buf_len);
if (ret != 0) {
CTX_LOG_ERROR(f->filepath, "Could not open raw file")
free(buf);
libraw_close(libraw_lib);
return;
}
if (*libraw_lib->idata.model != '\0') {
APPEND_STR_META(doc, MetaExifModel, libraw_lib->idata.model)
}
if (*libraw_lib->idata.make != '\0') {
APPEND_STR_META(doc, MetaExifMake, libraw_lib->idata.make)
}
if (*libraw_lib->idata.software != '\0') {
APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
}
APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)
char tmp[1024];
snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)
if (*libraw_lib->other.desc != '\0') {
APPEND_STR_META(doc, MetaContent, libraw_lib->other.desc)
}
if (*libraw_lib->other.artist != '\0') {
APPEND_STR_META(doc, MetaArtist, libraw_lib->other.artist)
}
struct tm *time = localtime(&libraw_lib->other.timestamp);
strftime(tmp, sizeof(tmp), "%Y:%m:%d %H:%M:%S", time);
APPEND_STR_META(doc, MetaExifDateTime, tmp)
snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.focal_len);
APPEND_STR_META(doc, MetaExifFocalLength, tmp)
snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
APPEND_STR_META(doc, MetaExifFNumber, tmp)
int denominator = (int) roundf(1 / libraw_lib->other.shutter);
snprintf(tmp, sizeof(tmp), "1/%d", denominator);
APPEND_STR_META(doc, MetaExifExposureTime, tmp)
libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
double gps_longitude_dec =
(gps.longitude[0] + gps.longitude[1] / 60 + gps.longitude[2] / 3600) * DMS_REF(gps.longref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
if (gps_longitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
}
double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
if (gps_latitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
}
APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")
if (ctx->tn_size <= 0) {
free(buf);
libraw_close(libraw_lib);
return;
}
int unpack_ret = libraw_unpack_thumb(libraw_lib);
if (unpack_ret != 0) {
CTX_LOG_ERRORF(f->filepath, "libraw_unpack_thumb returned error code %d", unpack_ret)
free(buf);
libraw_close(libraw_lib);
return;
}
int errc = 0;
libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
if (errc != 0) {
free(buf);
libraw_dcraw_clear_mem(thumb);
libraw_close(libraw_lib);
return;
}
int tn_ok = 0;
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
// TODO: technically this should work but is currently untested
tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
}
libraw_dcraw_clear_mem(thumb);
if (tn_ok == TRUE) {
free(buf);
libraw_close(libraw_lib);
return;
}
ret = libraw_unpack(libraw_lib);
if (ret != 0) {
CTX_LOG_ERROR(f->filepath, "Could not unpack raw file")
free(buf);
libraw_close(libraw_lib);
return;
}
libraw_dcraw_process(libraw_lib);
errc = 0;
libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
if (errc != 0) {
free(buf);
libraw_dcraw_clear_mem(img);
libraw_close(libraw_lib);
return;
}
store_thumbnail_rgb24(ctx, img, doc);
libraw_dcraw_clear_mem(img);
libraw_close(libraw_lib);
free(buf);
}

17
third-party/libscan/libscan/raw/raw.h vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef SIST2_RAW_H
#define SIST2_RAW_H
#include "../scan.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
} scan_raw_ctx_t;
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif //SIST2_RAW_H

171
third-party/libscan/libscan/scan.h vendored Normal file
View File

@@ -0,0 +1,171 @@
#ifndef SCAN_SCAN_H
#define SCAN_SCAN_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h"
#define SIST_SWS_ALGO SWS_LANCZOS
#define UNUSED(x) __attribute__((__unused__)) x
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
typedef int scan_code_t;
#define SCAN_OK (scan_code_t) 0
#define SCAN_ERR_READ (scan_code_t) (-1)
#define SCAN_ERR_SKIP (scan_code_t) (-2)
#define LEVEL_DEBUG 0
#define LEVEL_INFO 1
#define LEVEL_WARNING 2
#define LEVEL_ERROR 3
#define LEVEL_FATAL 4
#define CTX_LOG_DEBUGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_DEBUG, fmt, __VA_ARGS__);
#define CTX_LOG_DEBUG(filepath, str) ctx->log(filepath, LEVEL_DEBUG, str);
#define CTX_LOG_INFOF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_INFO, fmt, __VA_ARGS__);
#define CTX_LOG_INFO(filepath, str) ctx->log(filepath, LEVEL_INFO, str);
#define CTX_LOG_WARNINGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_WARNING, fmt, __VA_ARGS__);
#define CTX_LOG_WARNING(filepath, str) ctx->log(filepath, LEVEL_WARNING, str);
#define CTX_LOG_ERRORF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_ERROR, fmt, __VA_ARGS__);
#define CTX_LOG_ERROR(filepath, str) ctx->log(filepath, LEVEL_ERROR, str);
#define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1);
#define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1);
enum metakey {
// String
MetaContent = 1,
MetaMediaAudioCodec,
MetaMediaVideoCodec,
MetaArtist,
MetaAlbum,
MetaAlbumArtist,
MetaGenre,
MetaTitle,
MetaFontName,
MetaParent,
MetaExifMake,
MetaExifDescription,
MetaExifSoftware,
MetaExifExposureTime,
MetaExifFNumber,
MetaExifFocalLength,
MetaExifUserComment,
MetaExifModel,
MetaExifIsoSpeedRatings,
MetaExifDateTime,
MetaAuthor,
MetaModifiedBy,
MetaThumbnail,
MetaChecksum,
// Number
MetaWidth,
MetaHeight,
MetaMediaDuration,
MetaMediaBitrate,
MetaPages,
// ??
MetaExifGpsLongitudeDMS,
MetaExifGpsLongitudeRef,
MetaExifGpsLatitudeDMS,
MetaExifGpsLatitudeRef,
MetaExifGpsLatitudeDec,
MetaExifGpsLongitudeDec,
};
typedef struct meta_line {
struct meta_line *next;
enum metakey key;
union {
char str_val[0];
unsigned long long_val;
double double_val;
};
} meta_line_t;
typedef struct document {
unsigned char path_md5[MD5_DIGEST_LENGTH];
unsigned long size;
unsigned int mime;
int mtime;
short base;
short ext;
char has_parent;
meta_line_t *meta_head;
meta_line_t *meta_tail;
char *filepath;
} document_t;
typedef struct vfile vfile_t;
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
__attribute__((warn_unused_result))
typedef long (*seek_func_t)(struct vfile *, long offset, int whence);
typedef void (*close_func_t)(struct vfile *);
typedef void (*reset_func_t)(struct vfile *);
typedef struct vfile {
union {
int fd;
struct archive *arc;
const void *_test_data;
};
int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath;
struct stat info;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
void *rewind_buffer;
int rewind_buffer_size;
int rewind_buffer_cursor;
read_func_t read;
read_func_t read_rewindable;
close_func_t close;
reset_func_t reset;
log_callback_t log;
logf_callback_t logf;
} vfile_t;
typedef struct parse_job_t {
int base;
int ext;
struct vfile vfile;
unsigned char parent[MD5_DIGEST_LENGTH];
char filepath[1];
} parse_job_t;
#include "util.h"
typedef void (*parse_callback_t)(parse_job_t *job);
#endif

64
third-party/libscan/libscan/text/text.c vendored Normal file
View File

@@ -0,0 +1,64 @@
#include "text.h"
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(ctx->content_size, f->info.st_size);
if (to_read <= 2) {
return SCAN_OK;
}
char *buf = malloc(to_read);
int ret = f->read(f, buf, to_read);
if (ret < 0) {
CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
free(buf);
return SCAN_ERR_READ;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
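// Check the first two bytes for a UTF-16 byte-order mark; without one, the content is treated as UTF-8/ASCII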
if ((*(int16_t*)buf) == (int16_t)0xFFFE) {
text_buffer_append_string16_le(&tex, buf + 2, to_read - 2);
} else if((*(int16_t*)buf) == (int16_t)0xFEFF) {
text_buffer_append_string16_be(&tex, buf + 2, to_read - 2);
} else {
text_buffer_append_string(&tex, buf, to_read);
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}
#define MAX_MARKUP_SIZE (1024 * 1024)
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
char *buf = malloc(to_read + 1);
int ret = f->read(f, buf, to_read);
if (ret < 0) {
CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
free(buf);
return SCAN_ERR_READ;
}
*(buf + to_read) = '\0';
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_markup(&tex, buf);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}

18
third-party/libscan/libscan/text/text.h vendored Normal file
View File

@@ -0,0 +1,18 @@
#ifndef SCAN_TEXT_H
#define SCAN_TEXT_H
#include "../scan.h"
#include "../util.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
} scan_text_ctx_t;
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

0
third-party/libscan/libscan/util.c vendored Normal file
View File

361
third-party/libscan/libscan/util.h vendored Normal file
View File

@@ -0,0 +1,361 @@
#ifndef SCAN_UTIL_H
#define SCAN_UTIL_H
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
#define STR_STARTS_WITH_CONSTANT(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
#define TEXT_BUF_FULL (-1)
#define INITIAL_BUF_SIZE (1024 * 16)
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) (\
((c) >= '\'' && (c) <= ';') || \
((c) >= 'A' && (c) <= 'z') || \
((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD))
typedef struct dyn_buffer {
char *buf;
size_t cur;
size_t size;
} dyn_buffer_t;
typedef struct text_buffer {
long max_size;
int last_char_was_whitespace;
dyn_buffer_t dyn_buffer;
} text_buffer_t;
static int utf8_validchr2(const char *s) {
if (0x00 == (0x80 & *s)) {
return TRUE;
} else if (0xf0 == (0xf8 & *s)) {
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
(0x80 != (0xc0 & s[3]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[4])) {
return FALSE;
}
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
return FALSE;
}
} else if (0xe0 == (0xf0 & *s)) {
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[3])) {
return FALSE;
}
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
return FALSE;
}
} else if (0xc0 == (0xe0 & *s)) {
if (0x80 != (0xc0 & s[1])) {
return FALSE;
}
if (0x80 == (0xc0 & s[2])) {
return FALSE;
}
if (0 == (0x1e & s[0])) {
return FALSE;
}
} else {
return FALSE;
}
return TRUE;
}
static dyn_buffer_t dyn_buffer_create() {
dyn_buffer_t buf;
buf.size = INITIAL_BUF_SIZE;
buf.cur = 0;
buf.buf = (char *) malloc(INITIAL_BUF_SIZE);
return buf;
}
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
if (buf->cur + size > buf->size) {
do {
buf->size *= 2;
} while (buf->cur + size > buf->size);
buf->buf = (char *) realloc(buf->buf, buf->size);
}
}
static void grow_buffer_small(dyn_buffer_t *buf) {
if (buf->cur + sizeof(long) > buf->size) {
buf->size *= 2;
buf->buf = (char *) realloc(buf->buf, buf->size);
}
}
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
grow_buffer(buf, size);
memcpy(buf->buf + buf->cur, data, size);
buf->cur += size;
}
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
grow_buffer_small(buf);
*(buf->buf + buf->cur) = c;
buf->cur += sizeof(c);
}
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
dyn_buffer_write_char(buf, '\0');
}
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
}
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
grow_buffer_small(buf);
*(int *) (buf->buf + buf->cur) = d;
buf->cur += sizeof(int);
}
static void dyn_buffer_write_short(dyn_buffer_t *buf, uint16_t s) {
grow_buffer_small(buf);
*(uint16_t *) (buf->buf + buf->cur) = s;
buf->cur += sizeof(uint16_t);
}
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
grow_buffer_small(buf);
*(unsigned long *) (buf->buf + buf->cur) = l;
buf->cur += sizeof(unsigned long);
}
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
free(buf->buf);
}
static void text_buffer_destroy(text_buffer_t *buf) {
dyn_buffer_destroy(&buf->dyn_buffer);
}
static text_buffer_t text_buffer_create(long max_size) {
text_buffer_t text_buf;
text_buf.dyn_buffer = dyn_buffer_create();
text_buf.max_size = max_size;
text_buf.last_char_was_whitespace = FALSE;
return text_buf;
}
static int text_buffer_append_char(text_buffer_t *buf, int c) {
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
buf->last_char_was_whitespace = TRUE;
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
} else {
buf->last_char_was_whitespace = FALSE;
grow_buffer_small(&buf->dyn_buffer);
if (((utf8_int32_t) 0xffffff80 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c;
} else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
}
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
return 0;
}
static void text_buffer_terminate_string(text_buffer_t *buf) {
if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
} else {
dyn_buffer_write_char(&buf->dyn_buffer, '\0');
}
}
// Naive UTF-16 -> ASCII conversion: keeps only one byte of each 2-byte code unit
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
int ret = 0;
for (int i = 1; i < len; i += 2) {
ret = text_buffer_append_char(buf, str[i]);
}
return ret;
}
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
int ret = 0;
for (int i = 0; i < len; i += 2) {
ret = text_buffer_append_char(buf, str[i]);
}
return ret;
}
#define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
(0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
(0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
const char *ptr = str;
const char *oldPtr = ptr;
if (str == NULL || UTF8_END_OF_STRING) {
return 0;
}
if (len <= 4) {
for (int i = 0; i < len; i++) {
if (((utf8_int32_t) 0xffffff80 & str[i]) == 0 && SHOULD_KEEP_CHAR(str[i])) {
dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
}
}
return 0;
}
utf8_int32_t c;
char tmp[16] = {0};
do {
ptr = (char *) utf8codepoint(ptr, &c);
*(int *) tmp = 0x00000000;
memcpy(tmp, oldPtr, ptr - oldPtr);
oldPtr = ptr;
if (!utf8_validchr2(tmp)) {
continue;
}
int ret = text_buffer_append_char(buf, c);
if (ret != 0) {
return ret;
}
} while (!UTF8_END_OF_STRING);
return 0;
}
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
return text_buffer_append_string(buf, str, strlen(str));
}
static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
int tag_open = TRUE;
const char *ptr = markup;
const char *start = markup;
while (*ptr != '\0') {
if (tag_open) {
if (*ptr == '>') {
tag_open = FALSE;
start = ptr + 1;
}
} else {
if (*ptr == '<') {
tag_open = TRUE;
if (ptr != start) {
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
}
}
ptr += 1;
}
if (ptr != start) {
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}
static void *read_all(vfile_t *f, size_t *size) {
void *buf = malloc(f->info.st_size);
*size = f->read(f, buf, f->info.st_size);
if (*size != f->info.st_size) {
free(buf);
return NULL;
}
return buf;
}
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
unsigned char stack_buf[STACK_BUFFER_SIZE];
void *sha1_buf;
if (size <= STACK_BUFFER_SIZE) {
sha1_buf = stack_buf;
} else {
void *heap_sha1_buf = malloc(size);
sha1_buf = heap_sha1_buf;
}
memcpy(sha1_buf, buf, size);
SHA1_Update(ctx, (const void *) sha1_buf, size);
if (sha1_buf != stack_buf) {
free(sha1_buf);
}
}
#endif
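As a quick illustration (not part of this diff) of the text_buffer API above: create, append, terminate, then read the accumulated UTF-8 out of dyn_buffer.buf. The function name is hypothetical and the include path is assumed:
#include <stdio.h>
#include "util.h"

static void text_buffer_example(void) {
    text_buffer_t tex = text_buffer_create(4096);           // cap output at ~4 KiB
    text_buffer_append_string0(&tex, "Hello,   world");     // runs of whitespace collapse to one space
    text_buffer_append_markup(&tex, "<p>tags are stripped</p>");
    text_buffer_terminate_string(&tex);
    printf("%s\n", tex.dyn_buffer.buf);
    text_buffer_destroy(&tex);
}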

200
third-party/libscan/libscan/wpd/libwpd_c_api.cpp vendored Normal file

@@ -0,0 +1,200 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
class StringDocument : public WPXDocumentInterface {
private:
text_buffer_t *tex;
document_t *doc;
bool is_full;
public:
StringDocument(text_buffer_t *tex, document_t *doc) {
this->tex = tex;
this->doc = doc;
this->is_full = false;
}
void setDocumentMetaData(const WPXPropertyList &propList) override {
WPXPropertyList::Iter propIter(propList);
for (propIter.rewind(); propIter.next();) {
// TODO: Read metadata here ?!
}
}
void endDocument() override {
text_buffer_terminate_string(this->tex);
}
void closeParagraph() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSpan() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSection() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertTab() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertSpace() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertText(const WPXString &text) override {
if (!this->is_full) {
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertLineBreak() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
void closePageSpan() override { /* noop */ }
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
void closeHeader() override { /* noop */ }
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
void closeFooter() override { /* noop */ }
void
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
void
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void closeOrderedListLevel() override { /* noop */ }
void closeUnorderedListLevel() override { /* noop */ }
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void closeListElement() override { /* noop */ }
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
void closeFootnote() override { /* noop */ }
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
void closeEndnote() override { /* noop */ }
void openComment(const WPXPropertyList &propList) override { /* noop */ }
void closeComment() override { /* noop */ }
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
void closeTextBox() override { /* noop */ }
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
void closeTableRow() override { /* noop */ }
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTableCell() override { /* noop */ }
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTable() override { /* noop */ }
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
void closeFrame() override { /* noop */ }
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
void startDocument() override { /* noop */ };
};
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
auto *input = new WPXStringStream(buf, buf_len);
return input;
}
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
auto *stream = (WPXStringStream *) ptr;
WPDConfidence confidence = WPDocument::isFileFormatSupported(stream);
return (wpd_confidence_t) confidence;
}
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
auto *stream = (WPXStringStream *) ptr;
auto myDoc = StringDocument(tex, doc);
WPDResult result2 = WPDocument::parse(stream, &myDoc, nullptr);
return (wpd_result_t) result2;
}
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
auto *stream = (WPXStringStream *) ptr;
delete stream;
}

50
third-party/libscan/libscan/wpd/libwpd_c_api.h vendored Normal file

@@ -0,0 +1,50 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
typedef void *wpd_stream_t;
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
typedef enum {
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

41
third-party/libscan/libscan/wpd/wpd.c vendored Normal file

@@ -0,0 +1,41 @@
#include "wpd.h"
#include "libwpd_c_api.h"
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
void *buf = read_all(f, &buf_len);
void *stream = wpd_memory_stream_create(buf, buf_len);
wpd_confidence_t conf = wpd_is_file_format_supported(stream);
if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_ERR_READ;
}
if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_ERR_READ;
}
text_buffer_t tex = text_buffer_create(-1);
wpd_result_t res = wpd_parse(stream, &tex, doc);
if (res != C_WPD_OK) {
CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
doc->filepath, res)
}
if (tex.dyn_buffer.cur != 0) {
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
}
text_buffer_destroy(&tex);
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_OK;
}

23
third-party/libscan/libscan/wpd/wpd.h vendored Normal file

@@ -0,0 +1,23 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
unsigned int wpd_mime;
} scan_wpd_ctx_t;
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
return mime == ctx->wpd_mime;
}
#endif
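A hypothetical dispatch sketch (not part of this diff): the caller is assumed to have already resolved the file's mime id and opened the vfile_t, and the function name is illustrative only:
#include "wpd.h"

static void maybe_parse_wpd_example(scan_wpd_ctx_t *wpd_ctx, unsigned int mime,
                                    vfile_t *f, document_t *doc) {
    if (is_wpd(wpd_ctx, mime)) {
        // On success, the extracted text is attached to doc as a MetaContent entry
        parse_wpd(wpd_ctx, f, doc);
    }
}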

1182
third-party/libscan/test/main.cpp vendored Normal file

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.