Rework document IDs

2025-12-11 14:38:54 +00:00 · 2022-03-03 17:44:59 -05:00
parent cdc4c0ad3d
commit 16a4fb4874
35 changed files with 246 additions and 250 deletions
--- a/src/index/elastic.c
+++ b/src/index/elastic.c
@@ -45,7 +45,7 @@ void elastic_cleanup() {
    destroy_indexer(Indexer);
 }

-void print_json(cJSON *document, const char id_str[MD5_STR_LENGTH]) {
+void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {

    cJSON *line = cJSON_CreateObject();

@@ -72,19 +72,19 @@ void delete_document(const char* document_id_str, void* UNUSED(_data)) {
    bulk_line->type = ES_BULK_LINE_DELETE;
    bulk_line->next = NULL;

-    memcpy(bulk_line->path_md5_str, document_id_str, MD5_STR_LENGTH);
+    snprintf(bulk_line->doc_id, sizeof(bulk_line->doc_id), "%s", document_id_str); // bounded copy into the fixed-size doc_id field
    tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
 }


-void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]) {
+void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
    char *json = cJSON_PrintUnformatted(document);

    size_t json_len = strlen(json);
    es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t) + json_len + 2);
    bulk_line->type = ES_BULK_LINE_INDEX;
    memcpy(bulk_line->line, json, json_len);
-    memcpy(bulk_line->path_md5_str, index_id_str, MD5_STR_LENGTH);
+    snprintf(bulk_line->doc_id, sizeof(bulk_line->doc_id), "%s", doc_id); // bounded copy into the fixed-size doc_id field
    *(bulk_line->line + json_len) = '\n';
    *(bulk_line->line + json_len + 1) = '\0';
    bulk_line->next = NULL;
@@ -93,7 +93,7 @@ void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]) {
    tpool_add_work(IndexCtx.pool, index_json_func, bulk_line);
 }

-void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]) {
+void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {

    if (Indexer == NULL) {
        Indexer = create_indexer(IndexCtx.es_url, IndexCtx.es_index);
@@ -167,7 +167,7 @@ void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
            snprintf(
                    action_str, sizeof(action_str),
                    "{\"index\":{\"_id\":\"%s\",\"_type\":\"_doc\",\"_index\":\"%s\"}}\n",
-                    line->path_md5_str, Indexer->es_index
+                    line->doc_id, Indexer->es_index
            );

            size_t action_str_len = strlen(action_str);
@@ -184,7 +184,7 @@ void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
            snprintf(
                    action_str, sizeof(action_str),
                    "{\"delete\":{\"_id\":\"%s\",\"_index\":\"%s\"}}\n",
-                    line->path_md5_str, Indexer->es_index
+                    line->doc_id, Indexer->es_index
            );

            size_t action_str_len = strlen(action_str);
@@ -263,7 +263,7 @@ void _elastic_flush(int max) {
    if (r->status_code == 413) {

        if (max <= 1) {
-            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->path_md5_str)
+            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id)
            free_response(r);
            free(buf);
            free_queue(1);
--- a/src/index/elastic.h
+++ b/src/index/elastic.h
@@ -8,7 +8,7 @@

 typedef struct es_bulk_line {
    struct es_bulk_line *next;
-    char path_md5_str[MD5_STR_LENGTH];
+    char doc_id[SIST_DOC_ID_LEN];
    int type;
    char line[0];
 } es_bulk_line_t;
@@ -40,9 +40,9 @@ typedef struct es_indexer es_indexer_t;

 void elastic_index_line(es_bulk_line_t *line);

-void print_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
+void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]);

-void index_json(cJSON *document, const char index_id_str[MD5_STR_LENGTH]);
+void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]);

 void delete_document(const char *document_id_str, void* data);

@@ -59,6 +59,6 @@ char *elastic_get_status();

 es_version_t *elastic_get_version(const char *es_url);

-void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
+void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]);

 #endif
--- a/src/index/static_generated.c
+++ b/src/index/static_generated.c
--- a/src/io/serialize.c
+++ b/src/io/serialize.c
@@ -124,9 +124,7 @@ char *build_json_string(document_t *doc) {
        cJSON_AddStringToObject(json, "path", "");
    }

-    char md5_str[MD5_STR_LENGTH];
-    buf2hex(doc->path_md5, MD5_DIGEST_LENGTH, md5_str);
-    cJSON_AddStringToObject(json, "_id", md5_str);
+    cJSON_AddStringToObject(json, "_id", doc->doc_id);

    // Metadata
    meta_line_t *meta = doc->meta_head;
@@ -452,32 +450,31 @@ void read_lines(const char *path, const line_processor_t processor) {

    dyn_buffer_destroy(&buf);
    fclose(file);
-
 }

-void read_index_ndjson(const char *line, void* _data) {
-    void** data = _data;
-    const char* index_id = data[0];
+void read_index_ndjson(const char *line, void *_data) {
+    void **data = _data;
+    const char *index_id = data[0];
    index_func func = data[1];
    read_index_bin_handle_line(line, index_id, func);
 }

-void read_index(const char *path, const char index_id[MD5_STR_LENGTH], const char *type, index_func func) {
+void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
    if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
        read_lines(path, (line_processor_t) {
-            .data = (void*[2]){(void*)index_id, func} ,
-            .func = read_index_ndjson,
+                .data = (void *[2]) {(void *) index_id, func},
+                .func = read_index_ndjson,
        });
    }
 }

 static __thread GHashTable *IncrementalReadTable = NULL;

-void json_put_incremental(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
    const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;

-    incremental_put_str(IncrementalReadTable, path_md5_str, mtime);
+    incremental_put(IncrementalReadTable, path_md5_str, mtime);
 }

 void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
@@ -490,13 +487,11 @@ static __thread GHashTable *IncrementalNewTable = NULL;
 static __thread store_t *IncrementalCopySourceStore = NULL;
 static __thread store_t *IncrementalCopyDestinationStore = NULL;

-void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {

-    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
-    unsigned char path_md5[MD5_DIGEST_LENGTH];
-    hex2buf(path_md5_str, MD5_STR_LENGTH - 1, path_md5);
+    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

-    if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get_str(IncrementalCopyTable, path_md5_str)) {
+    if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
        // Copy index line
        cJSON_DeleteItemFromObject(document, "index");
        char *json_str = cJSON_PrintUnformatted(document);
@@ -510,9 +505,9 @@ void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_S

        // Copy tn store contents
        size_t buf_len;
-        char *buf = store_read(IncrementalCopySourceStore, (char *) path_md5, sizeof(path_md5), &buf_len);
+        char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len); // doc_id is a pointer: sizeof(doc_id) is not the key length
         if (buf_len != 0) {
-            store_write(IncrementalCopyDestinationStore, (char *) path_md5, sizeof(path_md5), buf, buf_len);
+            store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len); // same key length as the read above
            free(buf);
        }
    }
@@ -536,24 +531,24 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
    read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
 }

-void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[MD5_STR_LENGTH])) {
+void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {

-    char path_md5_n[MD5_STR_LENGTH + 1];
-    path_md5_n[MD5_STR_LENGTH] = '\0';
-    path_md5_n[MD5_STR_LENGTH - 1] = '\n';
-    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
+    char doc_id_n[SIST_DOC_ID_LEN + 1];
+    doc_id_n[SIST_DOC_ID_LEN] = '\0';
+    doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
+    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

    // do not delete archive virtual entries
    if (cJSON_GetObjectItem(document, "parent") == NULL 
-        && !incremental_get_str(IncrementalCopyTable, path_md5_str)
-        && !incremental_get_str(IncrementalNewTable, path_md5_str)
+        && !incremental_get(IncrementalCopyTable, doc_id)
+        && !incremental_get(IncrementalNewTable, doc_id)
        ) {
-        memcpy(path_md5_n, path_md5_str, MD5_STR_LENGTH - 1);
-        zstd_write_string(path_md5_n, MD5_STR_LENGTH);
+        memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
+        zstd_write_string(doc_id_n, SIST_DOC_ID_LEN); // write the '\n'-terminated copy: ID characters plus the newline
    }
 }

-void incremental_delete(const char *del_filepath, const char* index_filepath, 
+void incremental_delete(const char *del_filepath, const char *index_filepath,
                        GHashTable *copy_table, GHashTable *new_table) {

    if (WriterCtx.out_file == NULL) {
--- a/src/io/serialize.h
+++ b/src/io/serialize.h
@@ -12,7 +12,7 @@ typedef struct line_processor {
  void (*func)(const char*, void*);
 } line_processor_t;

-typedef void(*index_func)(cJSON *, const char[MD5_STR_LENGTH]);
+typedef void(*index_func)(cJSON *, const char[SIST_DOC_ID_LEN]);

 void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
                      const char *dst_filepath, GHashTable *copy_table);
@@ -24,7 +24,7 @@ void write_document(document_t *doc);

 void read_lines(const char *path, const line_processor_t processor);

-void read_index(const char *path, const char[MD5_STR_LENGTH], const char *type, index_func);
+void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func);

 void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc);

--- a/src/io/store.c
+++ b/src/io/store.c
@@ -52,22 +52,7 @@ void store_flush(store_t *store) {
 void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {

    if (LogCtx.very_verbose) {
-        if (key_len == MD5_DIGEST_LENGTH) {
-            char path_md5_str[MD5_STR_LENGTH];
-            buf2hex((unsigned char *) key, MD5_DIGEST_LENGTH, path_md5_str);
-
-            LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", path_md5_str, buf_len)
-
-        } else if (key_len == MD5_DIGEST_LENGTH + sizeof(int)) {
-            char path_md5_str[MD5_STR_LENGTH];
-            buf2hex((unsigned char *) key, MD5_DIGEST_LENGTH, path_md5_str);
-
-            LOG_DEBUGF("store.c", "Store write {%s/%d} %lu bytes",
-                       path_md5_str, *(int *) (key + MD5_DIGEST_LENGTH), buf_len);
-
-        } else {
-            LOG_DEBUGF("store.c", "Store write {%s} %lu bytes", key, buf_len)
-        }
+        LOG_DEBUGF("store.c", "Store write %s@{%s} %lu bytes", store->path, key, buf_len)
    }

 #if (SIST_FAKE_STORE != 1)
--- a/src/io/walk.c
+++ b/src/io/walk.c
@@ -22,7 +22,7 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,

    job->vfile.info = *info;

-    memset(job->parent, 0, MD5_DIGEST_LENGTH);
+    job->parent[0] = '\0';

    job->vfile.filepath = job->filepath;
    job->vfile.read = fs_read;
--- a/src/main.c
+++ b/src/main.c
@@ -118,7 +118,7 @@ void init_dir(const char *dirpath, scan_args_t* args) {
      index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
      memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
    } else {
-      // genreate new index id based on timestamp
+      // generate new index id based on timestamp
      unsigned char index_md5[MD5_DIGEST_LENGTH];
      MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
      buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
--- a/src/parsing/parse.c
+++ b/src/parsing/parse.c
@@ -69,7 +69,7 @@ void parse(void *arg) {
    doc->base = (short) job->base;

    char *rel_path = doc->filepath + ScanCtx.index.desc.root_len;
-    MD5((unsigned char *) rel_path, strlen(rel_path), doc->path_md5);
+    generate_doc_id(rel_path, doc->doc_id);

    doc->meta_head = NULL;
    doc->meta_tail = NULL;
@@ -77,10 +77,10 @@ void parse(void *arg) {
    doc->size = job->vfile.info.st_size;
    doc->mtime = (int) job->vfile.info.st_mtim.tv_sec;

-    int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
+    int inc_ts = incremental_get(ScanCtx.original_table, doc->doc_id);
    if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
        pthread_mutex_lock(&ScanCtx.copy_table_mu);
-        incremental_mark_file(ScanCtx.copy_table, doc->path_md5);
+        incremental_mark_file(ScanCtx.copy_table, doc->doc_id);
        pthread_mutex_unlock(&ScanCtx.copy_table_mu);

        pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
@@ -96,16 +96,14 @@ void parse(void *arg) {

    if (ScanCtx.new_table != NULL) {
        pthread_mutex_lock(&ScanCtx.copy_table_mu);
-        incremental_mark_file(ScanCtx.new_table, doc->path_md5);
+        incremental_mark_file(ScanCtx.new_table, doc->doc_id);
        pthread_mutex_unlock(&ScanCtx.copy_table_mu);
    }

    char *buf[MAGIC_BUF_SIZE];

    if (LogCtx.very_verbose) {
-        char path_md5_str[MD5_STR_LENGTH];
-        buf2hex(doc->path_md5, MD5_DIGEST_LENGTH, path_md5_str);
-        LOG_DEBUGF(job->filepath, "Starting parse job {%s}", path_md5_str)
+        LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id)
    }

    if (job->vfile.info.st_size == 0) {
@@ -218,10 +216,10 @@ void parse(void *arg) {
    abort:

    //Parent meta
-    if (!md5_digest_is_null(job->parent)) {
-        meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + MD5_STR_LENGTH);
+    if (job->parent[0] != '\0') {
+        meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + strlen(job->parent) + 1); // sized for the strcpy of job->parent below
        meta_parent->key = MetaParent;
-        buf2hex(job->parent, MD5_DIGEST_LENGTH, meta_parent->str_val);
+        strcpy(meta_parent->str_val, job->parent);
        APPEND_META((doc), meta_parent)

        doc->has_parent = TRUE;
--- a/src/parsing/sidecar.c
+++ b/src/parsing/sidecar.c
@@ -23,16 +23,19 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
    }
    char *json_str = cJSON_PrintUnformatted(json);

-    unsigned char path_md5[MD5_DIGEST_LENGTH];
-    MD5((unsigned char *) vfile->filepath + ScanCtx.index.desc.root_len, doc->ext - 1 - ScanCtx.index.desc.root_len,
-        path_md5);
+    char assoc_doc_id[SIST_DOC_ID_LEN];

-    char path_md5_str[MD5_STR_LENGTH];
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, path_md5_str);
+    char rel_path[PATH_MAX];
+    size_t rel_path_len = doc->ext - 1 - ScanCtx.index.desc.root_len;
+    memcpy(rel_path, vfile->filepath + ScanCtx.index.desc.root_len, rel_path_len);
+    *(rel_path + rel_path_len) = '\0';

-    store_write(ScanCtx.index.meta_store, path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
+    generate_doc_id(rel_path, assoc_doc_id);
+
+    store_write(ScanCtx.index.meta_store, assoc_doc_id, sizeof(assoc_doc_id), json_str,
+                strlen(json_str) + 1);

    cJSON_Delete(json);
    free(json_str);
    free(buf);
-}
+}
--- a/src/sist.h
+++ b/src/sist.h
@@ -53,7 +53,7 @@
 #include <ctype.h>
 #include "git_hash.h"

-#define VERSION "2.11.7"
+#define VERSION "2.12.0"
 static const char *const Version = VERSION;

 #ifndef SIST_PLATFORM
--- a/src/stats.c
+++ b/src/stats.c
@@ -20,7 +20,7 @@ typedef struct {
    long count;
 } agg_t;

-void fill_tables(cJSON *document, UNUSED(const char index_id[MD5_STR_LENGTH])) {
+void fill_tables(cJSON *document, UNUSED(const char index_id[SIST_INDEX_ID_LEN])) {

    if (cJSON_GetObjectItem(document, "parent") != NULL) {
        return;
--- a/src/types.h
+++ b/src/types.h
@@ -4,7 +4,7 @@
 #define INDEX_TYPE_NDJSON "ndjson"

 typedef struct index_descriptor {
-    char id[MD5_STR_LENGTH];
+    char id[SIST_INDEX_ID_LEN];
    char version[64];
    long timestamp;
    char root[PATH_MAX];
--- a/src/util.h
+++ b/src/util.h
@@ -10,8 +10,6 @@
 #include "third-party/utf8.h/utf8.h"
 #include "libscan/scan.h"

-#define MD5_STR_LENGTH 33
-

 char *abspath(const char *path);

@@ -94,40 +92,24 @@ static void buf2hex(const unsigned char *buf, size_t buflen, char *hex_string) {


 __always_inline
-static int md5_digest_is_null(const unsigned char digest[MD5_DIGEST_LENGTH]) {
-    return (*(int64_t *) digest) == 0 && (*((int64_t *) digest + 1)) == 0;
+static void generate_doc_id(const char *rel_path, char *doc_id) {
+    unsigned char md[MD5_DIGEST_LENGTH];
+
+    MD5((unsigned char *) rel_path, strlen(rel_path), md);
+    buf2hex(md, sizeof(md), doc_id);
 }

-
 __always_inline
-static void incremental_put(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH], int mtime) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
+static void incremental_put(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN], int mtime) {
+    char *ptr = calloc(1, SIST_DOC_ID_LEN); // zero-filled, so the bounded copy below is always NUL-terminated
+    strncpy(ptr, doc_id, SIST_DOC_ID_LEN - 1); // callers may pass an ID parsed from JSON; never write past the allocation
    g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
 }

 __always_inline
-static void incremental_put_str(GHashTable *table, const char *path_md5, int mtime) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    strcpy(ptr, path_md5);
-    g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
-}
-
-__always_inline
-static int incremental_get(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
+static int incremental_get(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
    if (table != NULL) {
-        char md5_str[MD5_STR_LENGTH];
-        buf2hex(path_md5, MD5_DIGEST_LENGTH, md5_str);
-        return GPOINTER_TO_INT(g_hash_table_lookup(table, md5_str));
-    } else {
-        return 0;
-    }
-}
-
-__always_inline
-static int incremental_get_str(GHashTable *table, const char *path_md5) {
-    if (table != NULL) {
-        return GPOINTER_TO_INT(g_hash_table_lookup(table, path_md5));
+        return GPOINTER_TO_INT(g_hash_table_lookup(table, doc_id));
    } else {
        return 0;
    }
@@ -138,9 +120,9 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
 * !!Not thread safe.
 */
 __always_inline
-static int incremental_mark_file(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
-    char *ptr = malloc(MD5_STR_LENGTH);
-    buf2hex(path_md5, MD5_DIGEST_LENGTH, ptr);
+static int incremental_mark_file(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
+    char *ptr = malloc(SIST_DOC_ID_LEN);
+    strcpy(ptr, doc_id);
    return g_hash_table_insert(table, ptr, GINT_TO_POINTER(1));
 }

--- a/src/web/serve.c
+++ b/src/web/serve.c
@@ -36,7 +36,7 @@ static void send_response_line(struct mg_connection *nc, int status_code, size_t

 index_t *get_index_by_id(const char *index_id) {
    for (int i = WebCtx.index_count; i >= 0; i--) {
-        if (strncmp(index_id, WebCtx.indices[i].desc.id, MD5_STR_LENGTH) == 0) {
+        if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
            return &WebCtx.indices[i];
        }
    }
@@ -70,23 +70,23 @@ void search_index(struct mg_connection *nc, struct mg_http_message *hm) {

 void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_index_id[SIST_INDEX_ID_LEN];
+    memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
+    *(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';

-    index_t *index = get_index_by_id(arg_md5);
+    index_t *index = get_index_by_id(arg_index_id);
    if (index == NULL) {
        HTTP_REPLY_NOT_FOUND
        return;
    }

    const char *file;
-    switch (atoi(hm->uri.ptr + 3 + MD5_STR_LENGTH)) {
+    switch (atoi(hm->uri.ptr + 3 + SIST_INDEX_ID_LEN)) {
        case 1:
            file = "treemap.csv";
            break;
@@ -150,28 +150,25 @@ void style_vendor(struct mg_connection *nc, struct mg_http_message *hm) {

 void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {

-    int parse_tn_num = FALSE;
+    int has_thumbnail_index = FALSE;

-    if (hm->uri.len != 68) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) {

-        if (hm->uri.len != 68 + 4) {
+        if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 4) {
            LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr)
            HTTP_REPLY_NOT_FOUND
            return;
        }
-        parse_tn_num = TRUE;
+        has_thumbnail_index = TRUE;
    }

-    char arg_file_md5[MD5_STR_LENGTH];
-    char arg_index[MD5_STR_LENGTH];
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    char arg_index[SIST_INDEX_ID_LEN];

-    memcpy(arg_index, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_index + MD5_STR_LENGTH - 1) = '\0';
-    memcpy(arg_file_md5, hm->uri.ptr + 3 + MD5_STR_LENGTH, MD5_STR_LENGTH);
-    *(arg_file_md5 + MD5_STR_LENGTH - 1) = '\0';
-
-    unsigned char md5_buf[MD5_DIGEST_LENGTH];
-    hex2buf(arg_file_md5, MD5_STR_LENGTH - 1, md5_buf);
+    memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
+    *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
+    memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

    store_t *store = get_store(arg_index);
    if (store == NULL) {
@@ -183,16 +180,17 @@ void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
    char *data;
    size_t data_len = 0;

-    if (parse_tn_num) {
-        int tn_num = atoi(hm->uri.ptr + 68);
+    if (has_thumbnail_index) {
+        const char *tn_index = hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2;

-        char tn_key[sizeof(md5_buf) + sizeof(int)];
-        memcpy(tn_key, md5_buf, sizeof(md5_buf));
-        memcpy(tn_key + sizeof(md5_buf), &tn_num, sizeof(tn_num));
+        char tn_key[sizeof(arg_doc_id) + sizeof(char) * 4] = {0}; // zeroed: the copies below leave the final key byte unwritten
+
+        memcpy(tn_key, arg_doc_id, sizeof(arg_doc_id));
+        memcpy(tn_key + sizeof(arg_doc_id) - 1, tn_index, sizeof(char) * 4);

        data = store_read(store, (char *) tn_key, sizeof(tn_key), &data_len);
    } else {
-        data = store_read(store, (char *) md5_buf, sizeof(md5_buf), &data_len);
+        data = store_read(store, (char *) arg_doc_id, sizeof(arg_doc_id), &data_len);
    }

    if (data_len != 0) {
@@ -357,17 +355,17 @@ void index_info(struct mg_connection *nc) {

 void document_info(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 2) {
+    if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
        LOG_DEBUGF("serve.c", "Invalid document_info path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    memcpy(arg_doc_id, hm->uri.ptr + 3, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

-    cJSON *doc = elastic_get_document(arg_md5);
+    cJSON *doc = elastic_get_document(arg_doc_id);
    cJSON *source = cJSON_GetObjectItem(doc, "_source");

    cJSON *index_id = cJSON_GetObjectItem(source, "index");
@@ -393,17 +391,17 @@ void document_info(struct mg_connection *nc, struct mg_http_message *hm) {

 void file(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != MD5_STR_LENGTH + 2) {
+    if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
        LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_md5[MD5_STR_LENGTH];
-    memcpy(arg_md5, hm->uri.ptr + 3, MD5_STR_LENGTH);
-    *(arg_md5 + MD5_STR_LENGTH - 1) = '\0';
+    char arg_doc_id[SIST_DOC_ID_LEN];
+    memcpy(arg_doc_id, hm->uri.ptr + 3, SIST_DOC_ID_LEN);
+    *(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';

-    const char *next = arg_md5;
+    const char *next = arg_doc_id;
    cJSON *doc = NULL;
    cJSON *index_id = NULL;
    cJSON *source = NULL;
@@ -454,7 +452,6 @@ void status(struct mg_connection *nc) {
 typedef struct {
    char *name;
    int delete;
-    char *path_md5_str;
    char *doc_id;
 } tag_req_t;

@@ -474,12 +471,6 @@ tag_req_t *parse_tag_request(cJSON *json) {
        return NULL;
    }

-    cJSON *arg_path_md5 = cJSON_GetObjectItem(json, "path_md5");
-    if (arg_path_md5 == NULL || !cJSON_IsString(arg_path_md5) ||
-        strlen(arg_path_md5->valuestring) != MD5_STR_LENGTH - 1) {
-        return NULL;
-    }
-
    cJSON *arg_doc_id = cJSON_GetObjectItem(json, "doc_id");
    if (arg_doc_id == NULL || !cJSON_IsString(arg_doc_id)) {
        return NULL;
@@ -488,22 +479,21 @@ tag_req_t *parse_tag_request(cJSON *json) {
    tag_req_t *req = malloc(sizeof(tag_req_t));
    req->delete = arg_delete->valueint;
    req->name = arg_name->valuestring;
-    req->path_md5_str = arg_path_md5->valuestring;
    req->doc_id = arg_doc_id->valuestring;

    return req;
 }

 void tag(struct mg_connection *nc, struct mg_http_message *hm) {
-    if (hm->uri.len != MD5_STR_LENGTH + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
        LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr)
        HTTP_REPLY_NOT_FOUND
        return;
    }

-    char arg_index[MD5_STR_LENGTH];
-    memcpy(arg_index, hm->uri.ptr + 5, MD5_STR_LENGTH);
-    *(arg_index + MD5_STR_LENGTH - 1) = '\0';
+    char arg_index[SIST_INDEX_ID_LEN];
+    memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN);
+    *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';

    if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
        LOG_DEBUG("serve.c", "Invalid tag request")
@@ -535,7 +525,7 @@ void tag(struct mg_connection *nc, struct mg_http_message *hm) {
    cJSON *arr = NULL;

    size_t data_len = 0;
-    const char *data = store_read(store, arg_req->path_md5_str, MD5_STR_LENGTH, &data_len);
+    const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len);
    if (data_len == 0) {
        arr = cJSON_CreateArray();
    } else {
@@ -595,7 +585,7 @@ void tag(struct mg_connection *nc, struct mg_http_message *hm) {
    }

    char *json_str = cJSON_PrintUnformatted(arr);
-    store_write(store, arg_req->path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
+    store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1);
    store_flush(store);

    free(arg_req);
--- a/src/web/static_generated.c
+++ b/src/web/static_generated.c