From 1cfceba51891b28e953a4fa1460c082d1927b0bb Mon Sep 17 00:00:00 2001 From: simon987 Date: Tue, 25 Apr 2023 08:49:50 -0400 Subject: [PATCH] wip --- CMakeLists.txt | 2 +- README.md | 2 +- src/cli.c | 32 +++++++++++++ src/cli.h | 11 +++++ src/database/database.c | 24 +--------- src/database/database.h | 13 +++-- src/database/database_fts.c | 88 ++++++++++++++++++++++++++++++++++ src/database/database_schema.c | 42 ++++++++++++++++ src/main.c | 38 ++++++++++++++- 9 files changed, 223 insertions(+), 29 deletions(-) create mode 100644 src/database/database_fts.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d52308..2ca2be4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ add_executable(sist2 src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp - src/database/database_stats.c src/database/database_schema.c) + src/database/database_stats.c src/database/database_schema.c src/database/database_fts.c) set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C) target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/) diff --git a/README.md b/README.md index 00fe53c..47de10e 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux 3. Install vcpkg dependencies ```bash - vcpkg install curl[core,openssl] sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample] + vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample] ``` 4. 
Build diff --git a/src/cli.c b/src/cli.c index 9546a27..d56d7e6 100644 --- a/src/cli.c +++ b/src/cli.c @@ -410,6 +410,33 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) { return 0; } +int sqlite_index_args_validate(sqlite_index_args_t *args, int argc, const char **argv) { /* Validates the arguments of the sqlite-index command. Prints an error and returns 1 when the positional PATH (argv[1]) is missing; otherwise stores the resolved path in args->index_path and returns 0. An unresolvable PATH or a missing --search-index is reported through LOG_FATALF/LOG_FATAL (presumably terminating the process - confirm). */ + + LogCtx.verbose = 1; + + if (argc < 2) { + fprintf(stderr, "Required positional argument: PATH.\n"); + return 1; + } + + char *index_path = abspath(argv[1]); /* NULL is treated as "file not found" below */ + if (index_path == NULL) { + LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]); + } else { + args->index_path = index_path; + } + + if (args->search_index_path == NULL) { /* --search-index has no default value */ + LOG_FATAL("cli.c", "Missing required argument --search-index"); + } + + LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path); + LOG_DEBUGF("cli.c", "arg search_index_path=%s", args->search_index_path); + LOG_DEBUGF("cli.c", "arg optimize_index=%d", args->optimize_database); + + return 0; +} + int web_args_validate(web_args_t *args, int argc, const char **argv) { LogCtx.verbose = 1; @@ -554,6 +581,11 @@ index_args_t *index_args_create() { return args; } +sqlite_index_args_t *sqlite_index_args_create() { /* Returns a zero-filled sqlite_index_args_t obtained from calloc(); the result is not checked for NULL here. */ + sqlite_index_args_t *args = calloc(sizeof(sqlite_index_args_t), 1); + return args; +} + web_args_t *web_args_create() { web_args_t *args = calloc(sizeof(web_args_t), 1); return args; diff --git a/src/cli.h b/src/cli.h index 10d48c3..d539b02 100644 --- a/src/cli.h +++ b/src/cli.h @@ -66,6 +66,12 @@ typedef struct index_args { int incremental; } index_args_t; +typedef struct { /* arguments of the sqlite-index command */ + char *index_path; /* set by sqlite_index_args_validate() from abspath(argv[1]) */ + char *search_index_path; /* target of the --search-index option */ + int optimize_database; /* copied from the shared --optimize-index flag in main() */ +} sqlite_index_args_t; + typedef struct web_args { char *es_url; char *es_index; @@ -102,6 +108,8 @@ typedef struct exec_args { index_args_t *index_args_create(); +sqlite_index_args_t *sqlite_index_args_create(); + void index_args_destroy(index_args_t *args); web_args_t *web_args_create(); @@ -110,6 +118,8 @@ void web_args_destroy(web_args_t *args); int 
index_args_validate(index_args_t *args, int argc, const char **argv); +int sqlite_index_args_validate(sqlite_index_args_t *args, int argc, const char **argv); + int web_args_validate(web_args_t *args, int argc, const char **argv); exec_args_t *exec_args_create(); @@ -118,4 +128,5 @@ void exec_args_destroy(exec_args_t *args); int exec_args_validate(exec_args_t *args, int argc, const char **argv); + #endif diff --git a/src/database/database.c b/src/database/database.c index 1800038..f878f20 100644 --- a/src/database/database.c +++ b/src/database/database.c @@ -74,6 +74,8 @@ void database_initialize(database_t *db) { CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IndexDatabaseSchema, NULL, NULL, NULL)); } else if (db->type == IPC_CONSUMER_DATABASE || db->type == IPC_PRODUCER_DATABASE) { CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IpcDatabaseSchema, NULL, NULL, NULL)); + } else if (db->type == FTS_DATABASE) { + CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, FtsDatabaseSchema, NULL, NULL, NULL)); } sqlite3_close(db->db); @@ -479,28 +481,6 @@ void database_write_thumbnail(database_t *db, const char *id, int num, void *dat } -//void database_create_fts_index(database_t *db, database_t *fts_db) { -// // In a separate file, -// -// // use database_initialize() to create FTS schema -// // if --force-reset, then truncate the tables first -// -// /* -// * create/append fts table -// * -// * create/append scalar index table with -// * id,index,size,mtime,mime -// * -// * create/append path index table with -// * index,path,depth -// * -// * content table is a view with SELECT UNION for all attached tables -// * random_seed column -// */ -// -// // INSERT INTO ft(ft) VALUES('optimize'); -//} - job_t *database_get_work(database_t *db, job_type_t job_type) { job_t *job; diff --git a/src/database/database.h b/src/database/database.h index feef5c0..f000cd7 100644 --- a/src/database/database.h +++ b/src/database/database.h @@ -10,6 +10,7 @@ typedef struct index_descriptor 
index_descriptor_t; extern const char *IpcDatabaseSchema; extern const char *IndexDatabaseSchema; +extern const char *FtsDatabaseSchema; /* schema executed by database_initialize() when db->type == FTS_DATABASE */ typedef enum { INDEX_DATABASE, @@ -86,8 +87,6 @@ typedef struct { long size; } treemap_row_t; -static treemap_row_t null_treemap_row = {0, 0, 0}; - database_t *database_create(const char *filename, database_type_t type); @@ -116,7 +115,7 @@ cJSON *database_document_iter(database_iterator_t *); database_iterator_t *database_create_delete_list_iterator(database_t *db); -char * database_delete_list_iter(database_iterator_t *iter); +char *database_delete_list_iter(database_iterator_t *iter); #define database_delete_list_iter_foreach(element, iter) \ for (char *(element) = database_delete_list_iter(iter); (element) != NULL; (element) = database_delete_list_iter(iter)) @@ -160,8 +159,14 @@ cJSON *database_get_stats(database_t *db, database_stat_type_d type); #define CRASH_IF_NOT_SQLITE_OK(x) do { \ int return_value = x; \ if (return_value != SQLITE_OK) { \ - LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \ + LOG_FATALF("database.c", "Sqlite error @ %s:%d : (%d) %s", __BASE_FILE__, __LINE__, return_value, sqlite3_errmsg(db->db)); \ } \ } while (0) +void database_fts_attach(database_t *db, const char *fts_database_path); /* attaches the file at fts_database_path to db's connection under the schema name "fts" */ + +void database_fts_index(database_t *db); /* upserts the documents of db into fts.document_index and removes the ids listed in delete_list */ + +void database_fts_optimize(database_t *db); /* runs the 'optimize' command on the search table, then PRAGMA fts.optimize */ + #endif //SIST2_DATABASE_H \ No newline at end of file diff --git a/src/database/database_fts.c b/src/database/database_fts.c new file mode 100644 index 0000000..1791ec7 --- /dev/null +++ b/src/database/database_fts.c @@ -0,0 +1,88 @@ +#include "database.h" +#include "src/ctx.h" + +void database_fts_attach(database_t *db, const char *fts_database_path) { /* Runs "ATTACH DATABASE ? AS fts" on db's connection; the path is supplied as a bound parameter instead of being spliced into the SQL text. */ + + LOG_DEBUGF("database_fts.c", "Attaching to %s", fts_database_path); + + sqlite3_stmt *stmt; + CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2( + db->db, "ATTACH DATABASE ? 
AS fts" + "", -1, &stmt, NULL)); + + sqlite3_bind_text(stmt, 1, fts_database_path, -1, SQLITE_STATIC); + + CRASH_IF_STMT_FAIL(sqlite3_step(stmt)); + sqlite3_finalize(stmt); +} + +void database_fts_index(database_t *db) { + + LOG_INFO("database_fts.c", "Creating content table."); + + CRASH_IF_NOT_SQLITE_OK(sqlite3_exec( + db->db, + "WITH docs AS (SELECT document.id as id,\n" + " (SELECT id FROM descriptor) as index_id,\n" + " size,\n" + " document.json_data ->> 'path' as path,\n" + " length(document.json_data->>'path') - length(REPLACE(document.json_data->>'path', '/', '')) as path_depth,\n" + " document.json_data ->> 'mime' as mime,\n" + " mtime,\n" + " CASE\n" + " WHEN sc.json_data IS NULL THEN CASE\n" + " WHEN t.tag IS NULL THEN json_set(\n" + " document.json_data, '$._id',\n" + " document.id, '$.size',\n" + " document.size, '$.mtime',\n" + " document.mtime)\n" + " ELSE json_set(document.json_data, '$._id',\n" + " document.id, '$.size',\n" + " document.size, '$.mtime',\n" + " document.mtime, '$.tag',\n" + " json_group_array(t.tag)) END\n" + " ELSE CASE\n" + " WHEN t.tag IS NULL THEN json_patch(\n" + " json_set(document.json_data, '$._id', document.id, '$.size',\n" + " document.size, '$.mtime', document.mtime),\n" + " sc.json_data)\n" + " ELSE json_set(json_patch(document.json_data, sc.json_data), '$._id',\n" + " document.id, '$.size', document.size, '$.mtime',\n" + " document.mtime, '$.tag',\n" + " json_group_array(t.tag)) END END as json_data\n" + " FROM document\n" + " LEFT JOIN document_sidecar sc ON document.id = sc.id\n" + " LEFT JOIN tag t ON document.id = t.id\n" + " GROUP BY document.id)\n" + "INSERT\n" + "INTO fts.document_index (id, index_id, size, path, path_depth, mtime, mime, json_data)\n" + "SELECT *\n" + "FROM docs\n" + "WHERE true\n" + "on conflict (id, index_id) do update set size=excluded.size,\n" + " mtime=excluded.mtime,\n" + " json_data=excluded.json_data;", + NULL, NULL, NULL)); + + CRASH_IF_NOT_SQLITE_OK(sqlite3_exec( + db->db, + "DELETE\n" 
+ "FROM fts.document_index\n" + "WHERE id IN (SELECT id FROM delete_list)\n" + " AND index_id = (SELECT id FROM descriptor);", + NULL, NULL, NULL + )); +} + +void database_fts_optimize(database_t *db) { /* Issues the FTS5 'optimize' command on the search table, then PRAGMA optimize on the attached "fts" schema. The search table is named without a schema prefix - presumably it resolves to the table created by FtsDatabaseSchema in the attached database; confirm. */ + LOG_INFO("database_fts.c", "Optimizing search index."); + + CRASH_IF_NOT_SQLITE_OK(sqlite3_exec( + db->db, + "INSERT INTO search(search) VALUES('optimize');", + NULL, NULL, NULL)); + LOG_DEBUG("database_fts.c", "Optimized fts5 table."); + + CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA fts.optimize;", NULL, NULL, NULL)); + LOG_DEBUG("database_fts.c", "optimized indices."); +} diff --git a/src/database/database_schema.c b/src/database/database_schema.c index 23cb05f..a55ccd9 100644 --- a/src/database/database_schema.c +++ b/src/database/database_schema.c @@ -1,3 +1,45 @@ +const char *FtsDatabaseSchema = /* Schema of the search database: document_index stores one row per (id, index_id); document_view exposes the name and content fields of json_data; search is an FTS5 table declared with content='document_view' and fed by the triggers on document_index. */ + "CREATE TABLE IF NOT EXISTS document_index (" + " id TEXT NOT NULL," + " index_id TEXT NOT NULL," + " size INTEGER NOT NULL," + " path TEXT NOT NULL," + " path_depth INT NOT NULL," + " mtime INTEGER NOT NULL," + " mime TEXT NOT NULL," + " json_data TEXT NOT NULL," + " PRIMARY KEY (id, index_id)" + ");" + "" + "CREATE VIEW IF NOT EXISTS document_view (rowid, name, content)" + " AS" + " SELECT rowid," + " json_data->>'name'," + " json_data->>'content'" + " FROM document_index;" + "" + "CREATE INDEX IF NOT EXISTS document_index_size_idx ON document_index (size);" + "CREATE INDEX IF NOT EXISTS document_index_mtime_idx ON document_index (mtime);" + "CREATE INDEX IF NOT EXISTS document_index_mime_idx ON document_index (mime);" + "CREATE INDEX IF NOT EXISTS document_index_path_idx ON document_index (path);" + "CREATE INDEX IF NOT EXISTS document_index_path_depth_idx ON document_index (path_depth);" + "" + "CREATE VIRTUAL TABLE IF NOT EXISTS search USING fts5 (" + " name," + " content," + " content='document_view'" + ");" + "" + "CREATE TRIGGER IF NOT EXISTS on_insert AFTER INSERT ON document_index BEGIN" + " INSERT INTO search(rowid, name, content) VALUES 
(new.rowid, new.json_data->>'name', new.json_data->>'content');" + "END;" + "CREATE TRIGGER IF NOT EXISTS on_delete AFTER DELETE ON document_index BEGIN" /* the FTS5 'delete' command needs the rowid of the entry to remove together with the values that were indexed for it */ + " INSERT INTO search(search, rowid, name, content) VALUES('delete', old.rowid, old.json_data->>'name', old.json_data->>'content');" + "END;" + "CREATE TRIGGER IF NOT EXISTS on_update AFTER UPDATE ON document_index BEGIN" /* an update is a 'delete' of the old entry followed by an insert of the new values */ + " INSERT INTO search(search, rowid, name, content) VALUES('delete', old.rowid, old.json_data->>'name', old.json_data->>'content');" + " INSERT INTO search(rowid, name, content) VALUES (new.rowid, new.json_data->>'name', new.json_data->>'content');" + "END;"; const char *IpcDatabaseSchema = "CREATE TABLE parse_job (" diff --git a/src/main.c b/src/main.c index 0d15d11..433483b 100644 --- a/src/main.c +++ b/src/main.c @@ -22,6 +22,7 @@ static const char *const usage[] = { "sist2 scan [OPTION]... PATH", "sist2 index [OPTION]... INDEX", + "sist2 sqlite-index [OPTION]... INDEX", "sist2 web [OPTION]... INDEX...", "sist2 exec-script [OPTION]... 
INDEX", NULL, @@ -351,6 +352,23 @@ void sist2_index(index_args_t *args) { free(desc); } +void sist2_sqlite_index(sqlite_index_args_t *args) { /* Entry point of the sqlite-index command: opens the index database at args->index_path, initializes the search database at args->search_index_path, attaches it to the index connection, indexes the documents and, when args->optimize_database is set, optimizes the search index. NOTE(review): search_db is not released in this function - confirm whether database_create()/database_initialize() leave something for the caller to free. */ + database_t *db = database_create(args->index_path, INDEX_DATABASE); + database_open(db); + + database_t *search_db = database_create(args->search_index_path, FTS_DATABASE); + database_initialize(search_db); /* creates the FTS schema in the search database file before it is attached below */ + + database_fts_attach(db, args->search_index_path); + + database_fts_index(db); + if (args->optimize_database) { + database_fts_optimize(db); + } + + database_close(db, FALSE); +} + void sist2_exec_script(exec_args_t *args) { LogCtx.verbose = TRUE; @@ -436,6 +454,7 @@ int main(int argc, const char *argv[]) { index_args_t *index_args = index_args_create(); web_args_t *web_args = web_args_create(); exec_args_t *exec_args = exec_args_create(); + sqlite_index_args_t *sqlite_index_args = sqlite_index_args_create(); int arg_version = 0; @@ -445,6 +464,7 @@ int main(int argc, const char *argv[]) { char *common_script_path = NULL; int common_async_script = 0; int common_threads = 0; + int common_optimize_database = 0; struct argparse_option options[] = { OPT_HELP(), @@ -471,7 +491,7 @@ int main(int argc, const char *argv[]) { OPT_STRING('o', "output", &scan_args->output, "Output index file path. DEFAULT: index.sist2"), OPT_BOOLEAN(0, "incremental", &scan_args->incremental, "If the output file path exists, only scan new or modified files."), - OPT_BOOLEAN(0, "optimize-index", &scan_args->optimize_database, + OPT_BOOLEAN(0, "optimize-index", &common_optimize_database, "Defragment index file after scan to reduce its file size."), OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."), OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: index"), @@ -520,6 +540,11 @@ int main(int argc, const char *argv[]) { OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. 
DEFAULT: 70"), OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings."), + OPT_GROUP("sqlite-index options"), + OPT_STRING(0, "search-index", &sqlite_index_args->search_index_path, "Path to search index. Will be created if it does not exist yet."), + OPT_BOOLEAN(0, "optimize-index", &common_optimize_database, + "Optimize search index file for smaller size and faster queries."), + OPT_GROUP("Web options"), OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT: http://localhost:9200"), OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, @@ -586,6 +611,9 @@ int main(int argc, const char *argv[]) { exec_args->async_script = common_async_script; index_args->async_script = common_async_script; + scan_args->optimize_database = common_optimize_database; + sqlite_index_args->optimize_database = common_optimize_database; + if (argc == 0) { argparse_usage(&argparse); goto end; @@ -605,6 +633,14 @@ int main(int argc, const char *argv[]) { } sist2_index(index_args); + } else if (strcmp(argv[0], "sqlite-index") == 0) { + + int err = sqlite_index_args_validate(sqlite_index_args, argc, argv); + if (err != 0) { + goto end; + } + sist2_sqlite_index(sqlite_index_args); + } else if (strcmp(argv[0], "web") == 0) { int err = web_args_validate(web_args, argc, argv);