Rework document IDs

2025-11-01 16:36:53 +00:00 · 2022-03-03 17:44:59 -05:00 · 2022-03-03 17:44:59 -05:00 · 16a4fb4874
commit 16a4fb4874
parent cdc4c0ad3d
35 changed files with 246 additions and 250 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -97,9 +97,12 @@ else ()
    target_compile_options(
            sist2
            PRIVATE
+
            -Ofast
+            #-march=native
            -fno-stack-protector
            -fomit-frame-pointer
+            #-freciprocal-math
    )
 endif ()

@ -121,11 +124,13 @@ target_link_libraries(
        CURL::libcurl

        pthread
-        magic
+        #magic

        c

        scan
+
+        /usr/lib/x86_64-linux-gnu/libmagic.so.1
 )

 add_custom_target(
--- a/2
+++ b/2
@ -9,7 +9,7 @@ RUN strip sist2 || mv sist2_debug sist2

 FROM --platform="linux/amd64" ubuntu:21.10

-RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
+RUN apt update && apt install -y curl libasan5 libmagic1 && rm -rf /var/lib/apt/lists/*

 RUN mkdir -p /usr/share/tessdata && \
    cd /usr/share/tessdata/ && \
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@ -103,7 +103,7 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
 * `--thumbnail-count`
    Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
    will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image 
-    every ~5s). Set to 0 to completely disable thumbnails.
+    every ~7s). Set to 0 to completely disable thumbnails.
 * `--content-size` 
    Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
    Repeated whitespace and special characters do not count toward this limit.
--- a/schema/settings.json
+++ b/schema/settings.json
@ -55,5 +55,37 @@
        ]
      }
    }
+  },
+  "mappings": {
+    "dynamic_templates": [
+      {
+        "keyword_fields": {
+          "match_mapping_type": "string",
+          "match":   "kw_*",
+          "mapping": {
+            "type": "keyword"
+          }
+        }
+      },
+      {
+        "integer_fields": {
+          "match_mapping_type": "*",
+          "match":   "int_*",
+          "mapping": {
+            "type": "integer"
+          }
+        }
+      },
+      {
+        "meta_fields": {
+          "match_mapping_type": "*",
+          "match":   "mt_*",
+          "mapping": {
+            "type": "keyword",
+            "index": false
+          }
+        }
+      }
+    ]
  }
 }
--- a/sist2-vue/dist/css/index.css
+++ b/sist2-vue/dist/css/index.css
--- a/sist2-vue/dist/js/chunk-vendors.js
+++ b/sist2-vue/dist/js/chunk-vendors.js
--- a/sist2-vue/dist/js/index.js
+++ b/sist2-vue/dist/js/index.js
--- a/sist2-vue/package-lock.json
+++ b/sist2-vue/package-lock.json
@ -12,7 +12,6 @@
        "axios": "^0.25.0",
        "bootstrap-vue": "^2.21.2",
        "core-js": "^3.6.5",
-        "crypto-es": "^1.2.7",
        "d3": "^5.16.0",
        "date-fns": "^2.21.3",
        "dom-to-image": "^2.6.0",
@ -5261,11 +5260,6 @@
        "node": "*"
      }
    },
-    "node_modules/crypto-es": {
-      "version": "1.2.7",
-      "resolved": "https://registry.npmjs.org/crypto-es/-/crypto-es-1.2.7.tgz",
-      "integrity": "sha512-UUqiVJ2gUuZFmbFsKmud3uuLcNP2+Opt+5ysmljycFCyhA0+T16XJmo1ev/t5kMChMqWh7IEvURNCqsg+SjZGQ=="
-    },
    "node_modules/css-color-names": {
      "version": "0.0.4",
      "resolved": "https://registry.npmjs.org/css-color-names/-/css-color-names-0.0.4.tgz",
@ -19621,11 +19615,6 @@
        "randomfill": "^1.0.3"
      }
    },
-    "crypto-es": {
-      "version": "1.2.7",
-      "resolved": "https://registry.npmjs.org/crypto-es/-/crypto-es-1.2.7.tgz",
-      "integrity": "sha512-UUqiVJ2gUuZFmbFsKmud3uuLcNP2+Opt+5ysmljycFCyhA0+T16XJmo1ev/t5kMChMqWh7IEvURNCqsg+SjZGQ=="
-    },
    "css-color-names": {
      "version": "0.0.4",
      "resolved": "https://registry.npmjs.org/css-color-names/-/css-color-names-0.0.4.tgz",
--- a/sist2-vue/package.json
+++ b/sist2-vue/package.json
@ -11,7 +11,6 @@
    "axios": "^0.25.0",
    "bootstrap-vue": "^2.21.2",
    "core-js": "^3.6.5",
-    "crypto-es": "^1.2.7",
    "d3": "^5.16.0",
    "date-fns": "^2.21.3",
    "dom-to-image": "^2.6.0",
--- a/sist2-vue/src/Sist2Api.ts
+++ b/sist2-vue/src/Sist2Api.ts
@ -1,6 +1,5 @@
 import axios from "axios";
 import {ext, strUnescape, lum} from "./util";
-import CryptoES from 'crypto-es';

 export interface EsTag {
    id: string
@ -30,7 +29,6 @@ export interface EsHit {
    _index: string
    _id: string
    _score: number
-    _path_md5: string
    _type: string
    _tags: Tag[]
    _seq: number
@ -249,11 +247,6 @@ class Sist2Api {
                res.hits.hits.forEach((hit: EsHit) => {
                    hit["_source"]["name"] = strUnescape(hit["_source"]["name"]);
                    hit["_source"]["path"] = strUnescape(hit["_source"]["path"]);
-                    hit["_path_md5"] = CryptoES.MD5(
-                        hit["_source"]["path"] +
-                        (hit["_source"]["path"] ? "/" : "") +
-                        hit["_source"]["name"] + ext(hit)
-                    ).toString();

                    this.setHitProps(hit);
                    this.setHitTags(hit);
@ -380,8 +373,7 @@ class Sist2Api {
        return axios.post(`${this.baseUrl}tag/` + hit["_source"]["index"], {
            delete: false,
            name: tag,
-            doc_id: hit["_id"],
-            path_md5: hit._path_md5
+            doc_id: hit["_id"]
        });
    }

@ -389,8 +381,7 @@ class Sist2Api {
        return axios.post(`${this.baseUrl}tag/` + hit["_source"]["index"], {
            delete: true,
            name: tag,
-            doc_id: hit["_id"],
-            path_md5: hit._path_md5
+            doc_id: hit["_id"]
        });
    }

--- a/sist2-vue/src/views/FilePage.vue
+++ b/sist2-vue/src/views/FilePage.vue
@ -56,6 +56,22 @@ export default Vue.extend({
    onThumbnailClick() {
      window.open(`/f/${this.doc._id}`, "_blank");
    },
+    findByCustomField(field, id) {
+      return {
+        query: {
+          bool: {
+            must: [
+              {
+                match: {
+                  [field]: id
+                }
+              }
+            ]
+          }
+        },
+        size: 1
+      }
+    },
    findById(id) {
      return {
        query: {
@ -103,6 +119,8 @@ export default Vue.extend({
      query = this.findById(this.$route.query.byId);
    } else if (this.$route.query.byName) {
      query = this.findByName(this.$route.query.byName);
+    } else if (this.$route.query.by && this.$route.query.q) {
+      query = this.findByCustomField(this.$route.query.by, this.$route.query.q)
    }

    if (query) {
--- a/src/index/elastic.c
+++ b/src/index/elastic.c
@ -45,7 +45,7 @@ void elastic_cleanup() {
    destroy_indexer(Indexer);
 }

-void print_json(cJSON *document, const char id_str[MD5_STR_LENGTH]) {
+void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {

    cJSON *line = cJSON_CreateObject();

@ -72,19 +72,19 @@ void delete_document(const char* document_id_str, void* UNUSED(_data)) {
    bulk_line->type = ES_BULK_LINE_DELETE;
    bulk_line->next = NULL;

-    memcpy(bulk_line->path_md5_str, document_id_str, MD5_STR_LENGTH);
+    strcpy(bulk_line->doc_id, document_id_str);
    tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
 }


-void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]) {
+void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
    char *json = cJSON_PrintUnformatted(document);

    size_t json_len = strlen(json);
    es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t) + json_len + 2);
    bulk_line->type = ES_BULK_LINE_INDEX;
    memcpy(bulk_line->line, json, json_len);
-    memcpy(bulk_line->path_md5_str, index_id_str, MD5_STR_LENGTH);
+    strcpy(bulk_line->doc_id, doc_id);
    *(bulk_line->line + json_len) = '\n';
    *(bulk_line->line + json_len + 1) = '\0';
    bulk_line->next = NULL;
@ -93,7 +93,7 @@ void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]) {
    tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
 }

-void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]) {
+void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {

    if (Indexer == NULL) {
        Indexer = create_indexer(IndexCtx.es_url, IndexCtx.es_index);
@ -167,7 +167,7 @@ void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
            snprintf(
                    action_str, sizeof(action_str),
                    "{\"index\":{\"_id\":\"%s\",\"_type\":\"_doc\",\"_index\":\"%s\"}}\n",
-                    line->path_md5_str, Indexer->es_index
+                    line->doc_id, Indexer->es_index
            );

            size_t action_str_len = strlen(action_str);
@ -184,7 +184,7 @@ void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
            snprintf(
                    action_str, sizeof(action_str),
                    "{\"delete\":{\"_id\":\"%s\",\"_index\":\"%s\"}}\n",
-                    line->path_md5_str, Indexer->es_index
+                    line->doc_id, Indexer->es_index
            );

            size_t action_str_len = strlen(action_str);
@ -263,7 +263,7 @@ void _elastic_flush(int max) {
    if (r->status_code == 413) {

        if (max <= 1) {
-            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->path_md5_str)
+            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id)
            free_response(r);
            free(buf);
            free_queue(1);
--- a/src/index/elastic.h
+++ b/src/index/elastic.h
@ -8,7 +8,7 @@

 typedef struct es_bulk_line {
    struct es_bulk_line *next;
-    char path_md5_str[MD5_STR_LENGTH];
+    char doc_id[SIST_DOC_ID_LEN];
    int type;
    char line[0];
 } es_bulk_line_t;
@ -40,9 +40,9 @@ typedef struct es_indexer es_indexer_t;

 void elastic_index_line(es_bulk_line_t *line);

-void print_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
+void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]);

-void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
+void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]);

 void delete_document(const char *document_id_str, void* data);

@ -59,6 +59,6 @@ char *elastic_get_status();

 es_version_t *elastic_get_version(const char *es_url);

-void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
+void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]);

 #endif
--- a/src/index/static_generated.c
+++ b/src/index/static_generated.c
--- a/src/io/serialize.c
+++ b/src/io/serialize.c
@ -124,9 +124,7 @@ char *build_json_string(document_t *doc) {
        cJSON_AddStringToObject(json, "path", "");
    }

-    char md5_str[MD5_STR_LENGTH];
-    buf2hex(doc->path_md5, MD5_DIGEST_LENGTH, md5_str);
-    cJSON_AddStringToObject(json, "_id", md5_str);
+    cJSON_AddStringToObject(json, "_id", doc->doc_id);

    // Metadata
    meta_line_t *meta = doc->meta_head;
@ -452,32 +450,31 @@ void read_lines(const char *path, const line_processor_t processor) {

    dyn_buffer_destroy(&buf);
    fclose(file);
-
 }

-void read_index_ndjson(const char *line, void* _data) {
-    void** data = _data;
-    const char* index_id = data[0];
+void read_index_ndjson(const char *line, void *_data) {
+    void **data = _data;
+    const char *index_id = data[0];
    index_func func = data[1];
    read_index_bin_handle_line(line, index_id, func);
 }

-void read_index(const char *path, const char index_id[MD5_STR_LENGTH], const char *type, index_func func) {
+void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
    if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
        read_lines(path, (line_processor_t) {
-            .data = (void*[2]){(void*)index_id, func} ,
-            .func = read_index_ndjson,
+                .data = (void *[2]) {(void *) index_id, func},
+                .func = read_index_ndjson,
        });
    }
 }

 static __thread GHashTable *IncrementalReadTable = NULL;

-void json_put_incremental(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
    const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;

-    incremental_put_str(IncrementalReadTable, path_md5_str, mtime);
+    incremental_put(IncrementalReadTable, path_md5_str, mtime);
 }

 void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
@ -490,13 +487,11 @@ static __thread GHashTable *IncrementalNewTable = NULL;
 static __thread store_t *IncrementalCopySourceStore = NULL;
 static __thread store_t *IncrementalCopyDestinationStore = NULL;

-void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {

-    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
-    unsigned char path_md5[MD5_DIGEST_LENGTH];
-    hex2buf(path_md5_str, MD5_STR_LENGTH - 1, path_md5);
+    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

-    if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get_str(IncrementalCopyTable, path_md5_str)) {
+    if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
        // Copy index line
        cJSON_DeleteItemFromObject(document, "index");
        char *json_str = cJSON_PrintUnformatted(document);
@ -510,9 +505,9 @@ void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_S

        // Copy tn store contents
        size_t buf_len;
-        char *buf = store_read(IncrementalCopySourceStore, (char *) path_md5, sizeof(path_md5), &buf_len);
+        char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len); // doc_id is a pointer, so sizeof(doc_id) is not the key length
        if (buf_len != 0) {
-            store_write(IncrementalCopyDestinationStore, (char *) path_md5, sizeof(path_md5), buf, buf_len);
+            store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len);
            free(buf);
        }
    }
@ -536,24 +531,24 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
    read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
 }

-void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {

-    char path_md5_n[MD5_STR_LENGTH + 1];
-    path_md5_n[MD5_STR_LENGTH] = '\0';
-    path_md5_n[MD5_STR_LENGTH - 1] = '\n';
-    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
+    char doc_id_n[SIST_DOC_ID_LEN + 1];
+    doc_id_n[SIST_DOC_ID_LEN] = '\0';
+    doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
+    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

    // do not delete archive virtual entries
    if (cJSON_GetObjectItem(document, "parent") == NULL 
-        && !incremental_get_str(IncrementalCopyTable, path_md5_str)
-        && !incremental_get_str(IncrementalNewTable, path_md5_str)
+        && !incremental_get(IncrementalCopyTable, doc_id)
+        && !incremental_get(IncrementalNewTable, doc_id)
        ) {
-        memcpy(path_md5_n, path_md5_str, MD5_STR_LENGTH - 1);
-        zstd_write_string(path_md5_n, MD5_STR_LENGTH);
+        memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
+        zstd_write_string(doc_id_n, SIST_DOC_ID_LEN); // write the newline-terminated copy (id + '\n'), not the raw JSON string
    }
 }

-void incremental_delete(const char *del_filepath, const char* index_filepath, 
+void incremental_delete(const char *del_filepath, const char *index_filepath,
                        GHashTable *copy_table, GHashTable *new_table) {

    if (WriterCtx.out_file == NULL) {
--- a/src/io/serialize.h
+++ b/src/io/serialize.h
@ -12,7 +12,7 @@ typedef struct line_processor {
  void (*func)(const char*, void*);
 } line_processor_t;

-typedef void(*index_func)(cJSON *, const char[MD5_STR_LENGTH]);
+typedef void(*index_func)(cJSON *, const char[SIST_DOC_ID_LEN]);

 void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
                      const char *dst_filepath, GHashTable *copy_table);
@ -24,7 +24,7 @@ void write_document(document_t *doc);

 void read_lines(const char *path, const line_processor_t processor);

-void read_index(const char *path, const char[MD5_STR_LENGTH], const char *type, index_func);
+void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func);

 void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc);

--- a/src/io/store.c
+++ b/src/io/store.c
@ -52,22 +52,7 @@ void store_flush(store_t *store) {
 void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {

    if (LogCtx.very_verbose) {
-        if (key_len == MD5_DIGEST_LENGTH) {
-            char path_md5_str[MD5_STR_LENGTH];
-            buf2hex((unsigned char *) key, MD5_DIGEST_LENGTH, path_md5_str);
-
-            LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", path_md5_str, buf_len)
-
-        } else if (key_len == MD5_DIGEST_LENGTH + sizeof(int)) {
-            char path_md5_str[MD5_STR_LENGTH];
-            buf2hex((unsigned char *) key, MD5_DIGEST_LENGTH, path_md5_str);
-
-            LOG_DEBUGF("store.c", "Store write {%s/%d} %lu bytes",
-                       path_md5_str, *(int *) (key + MD5_DIGEST_LENGTH), buf_len);
-
-        } else {
-            LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", key, buf_len)
-        }
+        LOG_DEBUGF("store.c", "Store write %s@{%s} %lu bytes", store->path, key, buf_len)
    }

 #if (SIST_FAKE_STORE != 1)
--- a/src/io/walk.c
+++ b/src/io/walk.c
@ -22,7 +22,7 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,

    job->vfile.info = *info;

-    memset(job->parent, 0, MD5_DIGEST_LENGTH);
+    job->parent[0] = '\0';

    job->vfile.filepath = job->filepath;
    job->vfile.read = fs_read;
--- a/src/main.c
+++ b/src/main.c
@ -118,7 +118,7 @@ void init_dir(const char *dirpath, scan_args_t* args) {
      index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
      memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
    } else {
-      // genreate new index id based on timestamp
+      // generate new index id based on timestamp
      unsigned char index_md5[MD5_DIGEST_LENGTH];
      MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
      buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
--- a/src/parsing/parse.c
+++ b/src/parsing/parse.c
@ -69,7 +69,7 @@ void parse(void *arg) {
    doc->base = (short) job->base;

    char *rel_path = doc->filepath + ScanCtx.index.desc.root_len;
-    MD5((unsigned char *) rel_path, strlen(rel_path), doc->path_md5);
+    generate_doc_id(rel_path, doc->doc_id);

    doc->meta_head = NULL;
    doc->meta_tail = NULL;
@ -77,10 +77,10 @@ void parse(void *arg) {
    doc->size = job->vfile.info.st_size;
    doc->mtime = (int) job->vfile.info.st_mtim.tv_sec;

-    int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
+    int inc_ts = incremental_get(ScanCtx.original_table, doc->doc_id);
    if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
        pthread_mutex_lock(&ScanCtx.copy_table_mu);
-        incremental_mark_file(ScanCtx.copy_table, doc->path_md5);
+        incremental_mark_file(ScanCtx.copy_table, doc->doc_id);
        pthread_mutex_unlock(&ScanCtx.copy_table_mu);

        pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
@ -96,16 +96,14 @@ void parse(void *arg) {

    if (ScanCtx.new_table != NULL) {
        pthread_mutex_lock(&ScanCtx.copy_table_mu);
-        incremental_mark_file(ScanCtx.new_table, doc->path_md5);
+        incremental_mark_file(ScanCtx.new_table, doc->doc_id);
        pthread_mutex_unlock(&ScanCtx.copy_table_mu);
    }

    char *buf[MAGIC_BUF_SIZE];

    if (LogCtx.very_verbose) {
-        char path_md5_str[MD5_STR_LENGTH];
-        buf2hex(doc->path_md5, MD5_DIGEST_LENGTH, path_md5_str);
-        LOG_DEBUGF(job->filepath, "Starting parse job {%s}", path_md5_str)
+        LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id)
    }

    if (job->vfile.info.st_size == 0) {
@ -218,10 +216,10 @@ void parse(void *arg) {
    abort:

    //Parent meta
-    if (!md5_digest_is_null(job->parent)) {
-        meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + MD5_STR_LENGTH);
+    if (job->parent[0] != '\0') {
+        meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + SIST_INDEX_ID_LEN);
        meta_parent->key = MetaParent;
-        buf2hex(job->parent, MD5_DIGEST_LENGTH, meta_parent->str_val);
+        strcpy(meta_parent->str_val, job->parent);
        APPEND_META((doc), meta_parent)

        doc->has_parent = TRUE;
--- a/src/parsing/sidecar.c
+++ b/src/parsing/sidecar.c
@ -23,16 +23,19 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
    }
    char *json_str = cJSON_PrintUnformatted(json);

-    unsigned char path_md5[MD5_DIGEST_LENGTH];
-    MD5((unsigned char *) vfile->filepath + ScanCtx.index.desc.root_len, doc->ext - 1 - ScanCtx.index.desc.root_len,
-        path_md5);
+    char assoc_doc_id[SIST_DOC_ID_LEN];

-    char path_md5_str[MD5_STR_LENGTH];
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, path_md5_str);
+    char rel_path[PATH_MAX];
+    size_t rel_path_len = doc->ext - 1 - ScanCtx.index.desc.root_len;
+    memcpy(rel_path, vfile->filepath + ScanCtx.index.desc.root_len, rel_path_len);
+    *(rel_path + rel_path_len) = '\0';

-    store_write(ScanCtx.index.meta_store, path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
+    generate_doc_id(rel_path, assoc_doc_id);
+
+    store_write(ScanCtx.index.meta_store, assoc_doc_id, sizeof(assoc_doc_id), json_str,
+                strlen(json_str) + 1);

    cJSON_Delete(json);
    free(json_str);
    free(buf);
-}
+}
--- a/src/sist.h
+++ b/src/sist.h
@ -53,7 +53,7 @@
 #include <ctype.h>
 #include "git_hash.h"

-#define VERSION "2.11.7"
+#define VERSION "2.12.0"
 static const char *const Version = VERSION;

 #ifndef SIST_PLATFORM
--- a/src/stats.c
+++ b/src/stats.c
@ -20,7 +20,7 @@ typedef struct {
    long count;
 } agg_t;

-void fill_tables(cJSON *document, UNUSED(const char index_id[MD5_STR_LENGTH])) {
+void fill_tables(cJSON *document, UNUSED(const char index_id[SIST_INDEX_ID_LEN])) {

    if (cJSON_GetObjectItem(document, "parent") != NULL) {
        return;
--- a/src/types.h
+++ b/src/types.h
@ -4,7 +4,7 @@
 #define INDEX_TYPE_NDJSON "ndjson"

 typedef struct index_descriptor {
-    char id[MD5_STR_LENGTH];
+    char id[SIST_INDEX_ID_LEN];
    char version[64];
    long timestamp;
    char root[PATH_MAX];
--- a/src/util.h
+++ b/src/util.h
@ -10,8 +10,6 @@
 #include "third-party/utf8.h/utf8.h"
 #include "libscan/scan.h"

-#define MD5_STR_LENGTH 33
-

 char *abspath(const char *path);

@ -94,40 +92,24 @@ static void buf2hex(const unsigned char *buf, size_t buflen, char *hex_string) {


 __always_inline
-static int md5_digest_is_null(const unsigned char digest[MD5_DIGEST_LENGTH]) {
-    return (*(int64_t *) digest) == 0 && (*((int64_t *) digest + 1)) == 0;
+static void generate_doc_id(const char *rel_path, char *doc_id) {
+    unsigned char md[MD5_DIGEST_LENGTH];
+
+    MD5((unsigned char *) rel_path, strlen(rel_path), md);
+    buf2hex(md, sizeof(md), doc_id);
 }

-
 __always_inline
-static void incremental_put(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH], int mtime) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
+static void incremental_put(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN], int mtime) {
+    char *ptr = malloc(SIST_DOC_ID_LEN);
+    strcpy(ptr, doc_id);
    g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
 }

 __always_inline
-static void incremental_put_str(GHashTable *table, const char *path_md5, int mtime) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    strcpy(ptr, path_md5);
-    g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
-}
-
-__always_inline
-static int incremental_get(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
+static int incremental_get(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
    if (table != NULL) {
-        char md5_str[MD5_STR_LENGTH];
-        buf2hex(path_md5, MD5_DIGEST_LENGTH, md5_str);
-        return GPOINTER_TO_INT(g_hash_table_lookup(table, md5_str));
-    } else {
-        return 0;
-    }
-}
-
-__always_inline
-static int incremental_get_str(GHashTable *table, const char *path_md5) {
-    if (table != NULL) {
-        return GPOINTER_TO_INT(g_hash_table_lookup(table, path_md5));
+        return GPOINTER_TO_INT(g_hash_table_lookup(table, doc_id));
    } else {
        return 0;
    }
@ -138,9 +120,9 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
 * !!Not thread safe.
 */
 __always_inline
-static int incremental_mark_file(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
+static int incremental_mark_file(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
+    char *ptr = malloc(SIST_DOC_ID_LEN);
+    strcpy(ptr, doc_id);
    return g_hash_table_insert(table, ptr, GINT_TO_POINTER(1));
 }

--- a/src/web/serve.c
+++ b/src/web/serve.c
@ -36,7 +36,7 @@ static void send_response_line(struct mg_connection *nc, int status_code, size_t

 index_t *get_index_by_id(const char *index_id) {
    for (int i = WebCtx.index_count; i >= 0; i--) {
-        if (strncmp(index_id, WebCtx.indices[i].desc.id, MD5_STR_LENGTH) == 0) {
+        if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
            return &WebCtx.indices[i];
        }
    }
@ -70,23 +70,23 @@ void search_index(struct mg_connection *nc, struct mg_http_message *hm) {

 void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_index_id[SIST_INDEX_ID_LEN];
+    memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
+    *(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';

-    index_t *index = get_index_by_id(arg_md5);
+    index_t *index = get_index_by_id(arg_index_id);
    if (index == NULL) {
        HTTP_REPLY_NOT_FOUND
        return;
    }

    const char *file;
-    switch (atoi(hm->uri.ptr + 3 + MD5_STR_LENGTH)) {
+    switch (atoi(hm->uri.ptr + 3 + SIST_INDEX_ID_LEN)) {
        case 1:
            file = "treemap.csv";
            break;
@ -150,28 +150,25 @@ void style_vendor(struct mg_connection *nc, struct mg_http_message *hm) {

 void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {

-    int parse_tn_num = FALSE;
+    int has_thumbnail_index = FALSE;

-    if (hm->uri.len != 68) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) {

-        if (hm->uri.len != 68 + 4) {
+        if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 4) {
            LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr)
            HTTP_REPLY_NOT_FOUND
            return;
        }
-        parse_tn_num = TRUE;
+        has_thumbnail_index = TRUE;
    }

-    char arg_file_md5[MD5_STR_LENGTH];
-    char arg_index[MD5_STR_LENGTH];
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    char arg_index[SIST_INDEX_ID_LEN];

-    memcpy(arg_index, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_index + MD5_STR_LENGTH - 1) = '\0';
-    memcpy(arg_file_md5, hm->uri.ptr + 3 + MD5_STR_LENGTH, MD5_STR_LENGTH);
-    *(arg_file_md5 + MD5_STR_LENGTH - 1) = '\0';
-
-    unsigned char md5_buf[MD5_DIGEST_LENGTH];
-    hex2buf(arg_file_md5, MD5_STR_LENGTH - 1, md5_buf);
+    memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
+    *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
+    memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

    store_t *store = get_store(arg_index);
    if (store == NULL) {
@ -183,16 +180,17 @@ void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
    char *data;
    size_t data_len = 0;

-    if (parse_tn_num) {
-        int tn_num = atoi(hm->uri.ptr + 68);
+    if (has_thumbnail_index) {
+        const char *tn_index = hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2;

-        char tn_key[sizeof(md5_buf) + sizeof(int)];
-        memcpy(tn_key, md5_buf, sizeof(md5_buf));
-        memcpy(tn_key + sizeof(md5_buf), &tn_num, sizeof(tn_num));
+        char tn_key[sizeof(arg_doc_id) + sizeof(char) * 4];
+        tn_key[sizeof(tn_key) - 1] = '\0'; // the copies below never reach the last byte, yet it is part of the lookup key
+        memcpy(tn_key, arg_doc_id, sizeof(arg_doc_id));
+        memcpy(tn_key + sizeof(arg_doc_id) - 1, tn_index, sizeof(char) * 4);

        data = store_read(store, (char *) tn_key, sizeof(tn_key), &data_len);
    } else {
-        data = store_read(store, (char *) md5_buf, sizeof(md5_buf), &data_len);
+        data = store_read(store, (char *) arg_doc_id, sizeof(arg_doc_id), &data_len);
    }

    if (data_len != 0) {
@ -357,17 +355,17 @@ void index_info(struct mg_connection *nc) {

 void document_info(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 2) {
+    if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
        LOG_DEBUGF("serve.c", "Invalid document_info path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    memcpy(arg_doc_id, hm->uri.ptr + 3, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

-    cJSON *doc = elastic_get_document(arg_md5);
+    cJSON *doc = elastic_get_document(arg_doc_id);
    cJSON *source = cJSON_GetObjectItem(doc, "_source");

    cJSON *index_id = cJSON_GetObjectItem(source, "index");
@ -393,17 +391,17 @@ void document_info(struct mg_connection *nc, struct mg_http_message *hm) {

 void file(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 2) {
+    if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
        LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    memcpy(arg_doc_id, hm->uri.ptr + 3, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

-    const char *next = arg_md5;
+    const char *next = arg_doc_id;
    cJSON *doc = NULL;
    cJSON *index_id = NULL;
    cJSON *source = NULL;
@ -454,7 +452,6 @@ void status(struct mg_connection *nc) {
 typedef struct {
    char *name;
    int delete;
-    char *path_md5_str;
    char *doc_id;
 } tag_req_t;

@ -474,12 +471,6 @@ tag_req_t *parse_tag_request(cJSON *json) {
        return NULL;
    }

-    cJSON *arg_path_md5 = cJSON_GetObjectItem(json, "path_md5");
-    if (arg_path_md5 == NULL || !cJSON_IsString(arg_path_md5) ||
-        strlen(arg_path_md5->valuestring) != MD5_STR_LENGTH - 1) {
-        return NULL;
-    }
-
    cJSON *arg_doc_id = cJSON_GetObjectItem(json, "doc_id");
    if (arg_doc_id == NULL || !cJSON_IsString(arg_doc_id)) {
        return NULL;
@ -488,22 +479,21 @@ tag_req_t *parse_tag_request(cJSON *json) {
    tag_req_t *req = malloc(sizeof(tag_req_t));
    req->delete = arg_delete->valueint;
    req->name = arg_name->valuestring;
-    req->path_md5_str = arg_path_md5->valuestring;
    req->doc_id = arg_doc_id->valuestring;

    return req;
 }

 void tag(struct mg_connection *nc, struct mg_http_message *hm) {
-    if (hm->uri.len != MD5_STR_LENGTH + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
        LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_index[MD5_STR_LENGTH];
-    memcpy(arg_index, hm->uri.ptr + 5, MD5_STR_LENGTH);
-    *(arg_index + MD5_STR_LENGTH - 1) = '\0';
+    char arg_index[SIST_INDEX_ID_LEN];
+    memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN);
+    *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';

    if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
        LOG_DEBUG("serve.c", "Invalid tag request")
@ -535,7 +525,7 @@ void tag(struct mg_connection *nc, struct mg_http_message *hm) {
    cJSON *arr = NULL;

    size_t data_len = 0;
-    const char *data = store_read(store, arg_req->path_md5_str, MD5_STR_LENGTH, &data_len);
+    const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len);
    if (data_len == 0) {
        arr = cJSON_CreateArray();
    } else {
@ -595,7 +585,7 @@ void tag(struct mg_connection *nc, struct mg_http_message *hm) {
    }

    char *json_str = cJSON_PrintUnformatted(arr);
-    store_write(store, arg_req->path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
+    store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1);
    store_flush(store);

    free(arg_req);
--- a/src/web/static_generated.c
+++ b/src/web/static_generated.c
--- a/third-party/libscan/CMakeLists.txt
+++ b/third-party/libscan/CMakeLists.txt
@ -6,26 +6,6 @@ set(CMAKE_C_STANDARD 11)
 option(BUILD_TESTS "Build tests" on)

 add_subdirectory(third-party/antiword)
-if (SIST_DEBUG)
-    add_compile_definitions(
-            antiword
-            DEBUG
-    )
-    target_compile_options(
-            antiword
-            PRIVATE
-            -g
-            -fstack-protector
-            -fno-omit-frame-pointer
-            -fsanitize=address
-            -fno-inline
-    )
-else()
-    add_compile_definitions(
-            antiword
-            NDEBUG
-    )
-endif()

 add_library(
        scan
@ -48,6 +28,38 @@ add_library(
        libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
 set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)

+if (SIST_DEBUG)  # debug configuration: build antiword with debug info and sanitizer instrumentation
+    add_compile_definitions(
+            antiword  # NOTE(review): add_compile_definitions() accepts only definitions, so this looks like it defines a macro named "antiword" rather than selecting the target -- confirm intent
+            DEBUG
+    )
+    target_compile_options(
+            antiword
+            PRIVATE
+            -g
+            -fstack-protector
+            -fno-omit-frame-pointer
+            -fsanitize=address  # given as a compile option only; no matching link option is set in this block
+            -fno-inline
+    )
+else()  # non-debug configuration
+    add_compile_definitions(
+            antiword  # NOTE(review): same as above, passed as a definition, not as a target name
+            NDEBUG
+    )
+
+    target_compile_options(
+            scan  # the optimization flags below are applied privately to the scan target
+            PRIVATE
+
+            -Ofast
+            #-march=native
+            -fno-stack-protector
+            -fomit-frame-pointer
+            #-freciprocal-math
+    )
+endif()
+
 set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)

 find_package(cJSON CONFIG REQUIRED)
--- a/third-party/libscan/libscan/arc/arc.c
+++ b/third-party/libscan/libscan/arc/arc.c
@ -202,7 +202,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
        sub_job->vfile.logf = ctx->logf;
        sub_job->vfile.has_checksum = FALSE;
        sub_job->vfile.calculate_checksum = f->calculate_checksum;
-        memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
+        strcpy(sub_job->parent, doc->doc_id);

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            sub_job->vfile.info = *archive_entry_stat(entry);
--- a/third-party/libscan/libscan/ebook/ebook.c
+++ b/third-party/libscan/libscan/ebook/ebook.c
@ -156,7 +156,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
    avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

    APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
+    ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);

    free(samples);
    av_packet_unref(&jpeg_packet);
--- a/third-party/libscan/libscan/font/font.c
+++ b/third-party/libscan/libscan/font/font.c
@ -232,7 +232,7 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
    bmp_format(&bmp_data, dimensions, bitmap);

    APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);
+    ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);
--- a/third-party/libscan/libscan/media/media.c
+++ b/third-party/libscan/libscan/media/media.c
@ -459,7 +459,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
    if (scaled_frame == STORE_AS_IS) {
        return_value = SAVE_THUMBNAIL_OK;

-        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
+        ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
                   frame_and_packet->packet->size);
    } else {
        // Encode frame to jpeg
@ -473,7 +473,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor

        // Save thumbnail
        if (thumbnail_index == 0) {
-            ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
+            ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
            return_value = SAVE_THUMBNAIL_OK;

        } else if (thumbnail_index > 1) {
@ -482,9 +482,8 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
            //  I figure out a better fix.
            thumbnail_index -= 1;

-            char tn_key[sizeof(doc->path_md5) + sizeof(int)];
-            memcpy(tn_key, doc->path_md5, sizeof(doc->path_md5));
-            memcpy(tn_key + sizeof(doc->path_md5), &thumbnail_index, sizeof(thumbnail_index));
+            char tn_key[sizeof(doc->doc_id) + sizeof(char) * 4];
+            snprintf(tn_key, sizeof(tn_key), "%s%04d", doc->doc_id, thumbnail_index);

            ctx->store((char *) tn_key, sizeof(tn_key), (char *) jpeg_packet.data, jpeg_packet.size);
        } else {
@ -579,8 +578,8 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
        int video_duration_in_seconds = (int) (pFormatCtx->duration / AV_TIME_BASE);

        int thumbnails_to_generate = (IS_VIDEO(pFormatCtx) && stream->codecpar->codec_id != AV_CODEC_ID_GIF && video_duration_in_seconds >= 15)
-                                     // Limit to ~1 thumbnail every 5s
-                                     ? MAX(MIN(ctx->tn_count, video_duration_in_seconds / 5 + 1), 1) + 1
+                                     // Limit to ~1 thumbnail every 7s
+                                     ? MAX(MIN(ctx->tn_count, video_duration_in_seconds / 7 + 1), 1) + 1
                                     : 1;

        const double seek_increment = thumbnails_to_generate == 1
@ -845,7 +844,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu

    if (scaled_frame == STORE_AS_IS) {
        APPEND_LONG_META(doc, MetaThumbnail, 1)
-        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
+        ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
                   frame_and_packet->packet->size);
    } else {
        // Encode frame to jpeg
@ -859,7 +858,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu

        // Save thumbnail
        APPEND_LONG_META(doc, MetaThumbnail, 1)
-        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
+        ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);

        av_packet_unref(&jpeg_packet);
        avcodec_free_context(&jpeg_encoder);
--- a/third-party/libscan/libscan/ooxml/ooxml.c
+++ b/third-party/libscan/libscan/ooxml/ooxml.c
@ -191,7 +191,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
    archive_read_data(a, buf, entry_size);

    APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, entry_size);
+    ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), buf, entry_size);
    free(buf);
 }

--- a/third-party/libscan/libscan/raw/raw.c
+++ b/third-party/libscan/libscan/raw/raw.c
@ -84,7 +84,7 @@ int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, do
    avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

    APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
+    ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);

    av_packet_unref(&jpeg_packet);
    av_free(*scaled_frame->data);
--- a/third-party/libscan/libscan/scan.h
+++ b/third-party/libscan/libscan/scan.h
@ -48,6 +48,10 @@ typedef int scan_code_t;
 #define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1);
 #define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1);

+#define MD5_STR_LENGTH 33 /* buffer size for an MD5 hex string: 32 characters plus the terminating '\0' */
+#define SIST_DOC_ID_LEN MD5_STR_LENGTH /* size of document.doc_id and parse_job_t.parent; also the key length passed to the store callbacks */
+#define SIST_INDEX_ID_LEN MD5_STR_LENGTH /* size of the index id buffer copied out of the tag request URI */
+
 enum metakey {
    // String
    MetaContent = 1,
@ -103,7 +107,7 @@ typedef struct meta_line {


 typedef struct document {
-    unsigned char path_md5[MD5_DIGEST_LENGTH];
+    char doc_id[SIST_DOC_ID_LEN];
    unsigned long size;
    unsigned int mime;
    int mtime;
@ -159,7 +163,7 @@ typedef struct parse_job_t {
    int base;
    int ext;
    struct vfile vfile;
-    unsigned char parent[MD5_DIGEST_LENGTH];
+    char parent[SIST_DOC_ID_LEN];
    char filepath[1];
 } parse_job_t;