This commit is contained in:
simon987 2023-04-25 08:49:50 -04:00
parent 35cfd3b3b1
commit 1cfceba518
9 changed files with 223 additions and 29 deletions

View File

@ -58,7 +58,7 @@ add_executable(sist2
src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp
src/database/database_stats.c src/database/database_schema.c)
src/database/database_stats.c src/database/database_schema.c src/database/database_fts.c)
set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)

View File

@ -185,7 +185,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
3. Install vcpkg dependencies
```bash
vcpkg install curl[core,openssl] sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample]
vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample]
```
4. Build

View File

@ -410,6 +410,33 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
return 0;
}
/**
 * Validate the arguments of the `sqlite-index` command.
 *
 * Enables verbose logging, resolves the positional PATH argument (argv[1]) with abspath()
 * and stores the result in args->index_path, then checks that --search-index was provided.
 *
 * @param args  Argument structure to validate; index_path is filled in on success.
 * @param argc  Number of entries in argv.
 * @param argv  Remaining command line; argv[1] is expected to be the index PATH.
 * @return 0 when the arguments are usable, 1 when the positional PATH argument is missing.
 */
int sqlite_index_args_validate(sqlite_index_args_t *args, int argc, const char **argv) {

    LogCtx.verbose = 1;

    if (argc < 2) {
        fprintf(stderr, "Required positional argument: PATH.\n");
        return 1;
    }

    char *resolved_path = abspath(argv[1]);
    if (resolved_path != NULL) {
        args->index_path = resolved_path;
    } else {
        LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
    }

    // The search index location has no default, it must come from --search-index
    if (args->search_index_path == NULL) {
        LOG_FATAL("cli.c", "Missing required argument --search-index");
    }

    LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path);
    LOG_DEBUGF("cli.c", "arg search_index_path=%s", args->search_index_path);
    LOG_DEBUGF("cli.c", "arg optimize_index=%d", args->optimize_database);

    return 0;
}
int web_args_validate(web_args_t *args, int argc, const char **argv) {
LogCtx.verbose = 1;
@ -554,6 +581,11 @@ index_args_t *index_args_create() {
return args;
}
/**
 * Allocate a zero-initialized sqlite_index_args_t.
 *
 * @return The new structure, or NULL if the allocation failed.
 */
sqlite_index_args_t *sqlite_index_args_create() {
    return calloc(1, sizeof(sqlite_index_args_t));
}
web_args_t *web_args_create() {
web_args_t *args = calloc(sizeof(web_args_t), 1);
return args;

View File

@ -66,6 +66,12 @@ typedef struct index_args {
int incremental;
} index_args_t;
// Arguments of the `sqlite-index` command.
typedef struct {
// Path of the index file, set by sqlite_index_args_validate() from abspath() of the positional PATH argument
char *index_path;
// Path of the search index, taken from the --search-index option
char *search_index_path;
// Non-zero when --optimize-index was given
int optimize_database;
} sqlite_index_args_t;
typedef struct web_args {
char *es_url;
char *es_index;
@ -102,6 +108,8 @@ typedef struct exec_args {
index_args_t *index_args_create();
sqlite_index_args_t *sqlite_index_args_create();
void index_args_destroy(index_args_t *args);
web_args_t *web_args_create();
@ -110,6 +118,8 @@ void web_args_destroy(web_args_t *args);
int index_args_validate(index_args_t *args, int argc, const char **argv);
int sqlite_index_args_validate(sqlite_index_args_t *args, int argc, const char **argv);
int web_args_validate(web_args_t *args, int argc, const char **argv);
exec_args_t *exec_args_create();
@ -118,4 +128,5 @@ void exec_args_destroy(exec_args_t *args);
int exec_args_validate(exec_args_t *args, int argc, const char **argv);
#endif

View File

@ -74,6 +74,8 @@ void database_initialize(database_t *db) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IndexDatabaseSchema, NULL, NULL, NULL));
} else if (db->type == IPC_CONSUMER_DATABASE || db->type == IPC_PRODUCER_DATABASE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IpcDatabaseSchema, NULL, NULL, NULL));
} else if (db->type == FTS_DATABASE) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, FtsDatabaseSchema, NULL, NULL, NULL));
}
sqlite3_close(db->db);
@ -479,28 +481,6 @@ void database_write_thumbnail(database_t *db, const char *id, int num, void *dat
}
//void database_create_fts_index(database_t *db, database_t *fts_db) {
// // In a separate file,
//
// // use database_initialize() to create FTS schema
// // if --force-reset, then truncate the tables first
//
// /*
// * create/append fts table
// *
// * create/append scalar index table with
// * id,index,size,mtime,mime
// *
// * create/append path index table with
// * index,path,depth
// *
// * content table is a view with SELECT UNION for all attached tables
// * random_seed column
// */
//
// // INSERT INTO ft(ft) VALUES('optimize');
//}
job_t *database_get_work(database_t *db, job_type_t job_type) {
job_t *job;

View File

@ -10,6 +10,7 @@ typedef struct index_descriptor index_descriptor_t;
extern const char *IpcDatabaseSchema;
extern const char *IndexDatabaseSchema;
extern const char *FtsDatabaseSchema;
typedef enum {
INDEX_DATABASE,
@ -86,8 +87,6 @@ typedef struct {
long size;
} treemap_row_t;
static treemap_row_t null_treemap_row = {0, 0, 0};
database_t *database_create(const char *filename, database_type_t type);
@ -116,7 +115,7 @@ cJSON *database_document_iter(database_iterator_t *);
database_iterator_t *database_create_delete_list_iterator(database_t *db);
char * database_delete_list_iter(database_iterator_t *iter);
char *database_delete_list_iter(database_iterator_t *iter);
#define database_delete_list_iter_foreach(element, iter) \
for (char *(element) = database_delete_list_iter(iter); (element) != NULL; (element) = database_delete_list_iter(iter))
@ -160,8 +159,14 @@ cJSON *database_get_stats(database_t *db, database_stat_type_d type);
// Evaluates `x` exactly once and, when the result is not SQLITE_OK, reports the error code together with
// sqlite3_errmsg(db->db) through LOG_FATALF. The expansion reads `db->db`, so a variable named `db`
// exposing the sqlite3 handle must be in scope wherever the macro is used.
#define CRASH_IF_NOT_SQLITE_OK(x) do { \
int return_value = x; \
if (return_value != SQLITE_OK) { \
LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
LOG_FATALF("database.c", "Sqlite error @ %s:%d : (%d) %s", __BASE_FILE__, __LINE__, return_value, sqlite3_errmsg(db->db)); \
} \
} while (0)
// Attaches the database file at fts_database_path to db's connection under the schema name `fts`.
void database_fts_attach(database_t *db, const char *fts_database_path);
// Upserts the documents of db into fts.document_index, then removes the rows listed in db's delete_list.
// The `fts` schema must already be attached to db.
void database_fts_index(database_t *db);
// Runs the fts5 'optimize' command on the `search` table, then `PRAGMA fts.optimize`.
void database_fts_optimize(database_t *db);
#endif //SIST2_DATABASE_H

View File

@ -0,0 +1,88 @@
#include "database.h"
#include "src/ctx.h"
/**
 * Attach the search database file to an open connection under the schema name `fts`.
 *
 * The path is passed as a bound parameter rather than being spliced into the SQL text.
 *
 * @param db                 Database whose connection (db->db) runs the ATTACH statement.
 * @param fts_database_path  Path of the database file to attach. It is bound with SQLITE_STATIC,
 *                           so it must stay valid until this function returns.
 */
void database_fts_attach(database_t *db, const char *fts_database_path) {

    LOG_DEBUGF("database_fts.c", "Attaching to %s", fts_database_path);

    sqlite3_stmt *stmt;
    CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
            db->db, "ATTACH DATABASE ? AS fts", -1, &stmt, NULL));

    // Check the bind result: if it failed silently the parameter would be left unbound (NULL)
    // and the error would only surface later, or not at all.
    CRASH_IF_NOT_SQLITE_OK(sqlite3_bind_text(stmt, 1, fts_database_path, -1, SQLITE_STATIC));

    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
    sqlite3_finalize(stmt);
}
/**
 * Copy the documents of an index database into the attached search database.
 *
 * Runs two statements on db->db:
 *  1. An upsert into fts.document_index with one row per document. json_data is the document's
 *     JSON extended with `_id`, `size` and `mtime`, merged with the sidecar JSON when a
 *     document_sidecar row exists, and with a `tag` array when the document has tags.
 *     On a (id, index_id) conflict only size, mtime and json_data are refreshed.
 *  2. A delete of every fts.document_index row of this index whose id is listed in delete_list.
 *
 * The `fts` schema must already be attached to the connection (see database_fts_attach()).
 *
 * @param db  Index database with the search database attached as `fts`.
 */
void database_fts_index(database_t *db) {

    LOG_INFO("database_fts.c", "Creating content table.");

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "WITH docs AS (SELECT document.id as id,\n"
            " (SELECT id FROM descriptor) as index_id,\n"
            " size,\n"
            " document.json_data ->> 'path' as path,\n"
            " length(document.json_data->>'path') - length(REPLACE(document.json_data->>'path', '/', '')) as path_depth,\n"
            " document.json_data ->> 'mime' as mime,\n"
            " mtime,\n"
            " CASE\n"
            " WHEN sc.json_data IS NULL THEN CASE\n"
            " WHEN t.tag IS NULL THEN json_set(\n"
            " document.json_data, '$._id',\n"
            " document.id, '$.size',\n"
            " document.size, '$.mtime',\n"
            " document.mtime)\n"
            " ELSE json_set(document.json_data, '$._id',\n"
            " document.id, '$.size',\n"
            " document.size, '$.mtime',\n"
            " document.mtime, '$.tag',\n"
            " json_group_array(t.tag)) END\n"
            " ELSE CASE\n"
            " WHEN t.tag IS NULL THEN json_patch(\n"
            " json_set(document.json_data, '$._id', document.id, '$.size',\n"
            " document.size, '$.mtime', document.mtime),\n"
            " sc.json_data)\n"
            " ELSE json_set(json_patch(document.json_data, sc.json_data), '$._id',\n"
            " document.id, '$.size', document.size, '$.mtime',\n"
            " document.mtime, '$.tag',\n"
            " json_group_array(t.tag)) END END as json_data\n"
            " FROM document\n"
            " LEFT JOIN document_sidecar sc ON document.id = sc.id\n"
            " LEFT JOIN tag t ON document.id = t.id\n"
            " GROUP BY document.id)\n"
            "INSERT\n"
            "INTO fts.document_index (id, index_id, size, path, path_depth, mtime, mime, json_data)\n"
            // Columns are named explicitly and in the same order as the INSERT list above.
            // `SELECT *` followed the CTE order (..., mime, mtime, ...) and therefore stored
            // the mime string in `mtime` and the timestamp in `mime`.
            "SELECT id, index_id, size, path, path_depth, mtime, mime, json_data\n"
            "FROM docs\n"
            // The WHERE clause is required by SQLite to disambiguate INSERT ... SELECT ... ON CONFLICT
            "WHERE true\n"
            "on conflict (id, index_id) do update set size=excluded.size,\n"
            " mtime=excluded.mtime,\n"
            " json_data=excluded.json_data;",
            NULL, NULL, NULL));

    // Drop the documents that were flagged for deletion in this index
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE\n"
            "FROM fts.document_index\n"
            "WHERE id IN (SELECT id FROM delete_list)\n"
            " AND index_id = (SELECT id FROM descriptor);",
            NULL, NULL, NULL
    ));
}
/**
 * Optimize the attached search database.
 *
 * First issues the fts5 'optimize' command on the `search` table, then runs
 * `PRAGMA fts.optimize` on the attached `fts` schema.
 *
 * @param db  Index database with the search database attached as `fts`.
 */
void database_fts_optimize(database_t *db) {

    static const char *const fts5_optimize_sql = "INSERT INTO search(search) VALUES('optimize');";
    static const char *const pragma_optimize_sql = "PRAGMA fts.optimize;";

    LOG_INFO("database_fts.c", "Optimizing search index.");

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, fts5_optimize_sql, NULL, NULL, NULL));
    LOG_DEBUG("database_fts.c", "Optimized fts5 table.");

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, pragma_optimize_sql, NULL, NULL, NULL));
    LOG_DEBUG("database_fts.c", "optimized indices.");
}

View File

@ -1,3 +1,45 @@
/*
 * Schema of the search database.
 *
 *  - document_index: one row per (document id, index id) with the columns used for filtering
 *    and the full document JSON.
 *  - document_view:  exposes rowid, name and content of document_index; it is the external
 *    content source of the fts5 table.
 *  - search:         fts5 table over name and content.
 *  - triggers:       keep `search` in sync with document_index on insert, delete and update.
 *
 * Every statement uses IF NOT EXISTS, so the script can be executed on an existing database.
 */
const char *FtsDatabaseSchema =
        "CREATE TABLE IF NOT EXISTS document_index ("
        " id TEXT NOT NULL,"
        " index_id TEXT NOT NULL,"
        " size INTEGER NOT NULL,"
        " path TEXT NOT NULL,"
        " path_depth INT NOT NULL,"
        " mtime INTEGER NOT NULL,"
        " mime TEXT NOT NULL,"
        " json_data TEXT NOT NULL,"
        " PRIMARY KEY (id, index_id)"
        ");"
        ""
        "CREATE VIEW IF NOT EXISTS document_view (rowid, name, content)"
        " AS"
        " SELECT rowid,"
        " json_data->>'name',"
        " json_data->>'content'"
        " FROM document_index;"
        ""
        "CREATE INDEX IF NOT EXISTS document_index_size_idx ON document_index (size);"
        "CREATE INDEX IF NOT EXISTS document_index_mtime_idx ON document_index (mtime);"
        "CREATE INDEX IF NOT EXISTS document_index_mime_idx ON document_index (mime);"
        "CREATE INDEX IF NOT EXISTS document_index_path_idx ON document_index (path);"
        "CREATE INDEX IF NOT EXISTS document_index_path_depth_idx ON document_index (path_depth);"
        ""
        "CREATE VIRTUAL TABLE IF NOT EXISTS search USING fts5 ("
        " name,"
        " content,"
        " content='document_view'"
        ");"
        ""
        "CREATE TRIGGER IF NOT EXISTS on_insert AFTER INSERT ON document_index BEGIN"
        " INSERT INTO search(rowid, name, content) VALUES (new.rowid, new.json_data->>'name', new.json_data->>'content');"
        "END;"
        // The fts5 'delete' command needs the rowid of the removed row, exactly like the
        // delete half of on_update below; without it the index entries of that row are not removed.
        "CREATE TRIGGER IF NOT EXISTS on_delete AFTER DELETE ON document_index BEGIN"
        " INSERT INTO search(search, rowid, name, content) VALUES('delete', old.rowid, old.json_data->>'name', old.json_data->>'content');"
        "END;"
        // An update is a delete of the old values followed by an insert of the new ones
        "CREATE TRIGGER IF NOT EXISTS on_update AFTER UPDATE ON document_index BEGIN"
        " INSERT INTO search(search, rowid, name, content) VALUES('delete', old.rowid, old.json_data->>'name', old.json_data->>'content');"
        " INSERT INTO search(rowid, name, content) VALUES (new.rowid, new.json_data->>'name', new.json_data->>'content');"
        "END;";
const char *IpcDatabaseSchema =
"CREATE TABLE parse_job ("

View File

@ -22,6 +22,7 @@
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
"sist2 sqlite-index [OPTION]... INDEX",
"sist2 web [OPTION]... INDEX...",
"sist2 exec-script [OPTION]... INDEX",
NULL,
@ -351,6 +352,23 @@ void sist2_index(index_args_t *args) {
free(desc);
}
/**
 * Entry point of the `sqlite-index` command.
 *
 * Opens the index database, initializes the search database with the FTS schema, attaches it to
 * the index connection, copies the documents over and, when requested, optimizes the search index.
 *
 * @param args  Validated command arguments (index path, search index path, optimize flag).
 */
void sist2_sqlite_index(sqlite_index_args_t *args) {

    database_t *index_db = database_create(args->index_path, INDEX_DATABASE);
    database_open(index_db);

    // Apply the FTS schema to the search database file before it gets attached
    database_t *fts_db = database_create(args->search_index_path, FTS_DATABASE);
    database_initialize(fts_db);

    // All indexing work goes through the index database connection, with the search database attached
    database_fts_attach(index_db, args->search_index_path);
    database_fts_index(index_db);

    if (args->optimize_database) {
        database_fts_optimize(index_db);
    }

    database_close(index_db, FALSE);
}
void sist2_exec_script(exec_args_t *args) {
LogCtx.verbose = TRUE;
@ -436,6 +454,7 @@ int main(int argc, const char *argv[]) {
index_args_t *index_args = index_args_create();
web_args_t *web_args = web_args_create();
exec_args_t *exec_args = exec_args_create();
sqlite_index_args_t *sqlite_index_args = sqlite_index_args_create();
int arg_version = 0;
@ -445,6 +464,7 @@ int main(int argc, const char *argv[]) {
char *common_script_path = NULL;
int common_async_script = 0;
int common_threads = 0;
int common_optimize_database = 0;
struct argparse_option options[] = {
OPT_HELP(),
@ -471,7 +491,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING('o', "output", &scan_args->output, "Output index file path. DEFAULT: index.sist2"),
OPT_BOOLEAN(0, "incremental", &scan_args->incremental,
"If the output file path exists, only scan new or modified files."),
OPT_BOOLEAN(0, "optimize-index", &scan_args->optimize_database,
OPT_BOOLEAN(0, "optimize-index", &common_optimize_database,
"Defragment index file after scan to reduce its file size."),
OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."),
OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: index"),
@ -520,6 +540,11 @@ int main(int argc, const char *argv[]) {
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 70"),
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings."),
OPT_GROUP("sqlite-index options"),
OPT_STRING(0, "search-index", &sqlite_index_args->search_index_path, "Path to search index. Will be created if it does not exist yet."),
OPT_BOOLEAN(0, "optimize-index", &common_optimize_database,
"Optimize search index file for smaller size and faster queries."),
OPT_GROUP("Web options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT: http://localhost:9200"),
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
@ -586,6 +611,9 @@ int main(int argc, const char *argv[]) {
exec_args->async_script = common_async_script;
index_args->async_script = common_async_script;
scan_args->optimize_database = common_optimize_database;
sqlite_index_args->optimize_database = common_optimize_database;
if (argc == 0) {
argparse_usage(&argparse);
goto end;
@ -605,6 +633,14 @@ int main(int argc, const char *argv[]) {
}
sist2_index(index_args);
} else if (strcmp(argv[0], "sqlite-index") == 0) {
int err = sqlite_index_args_validate(sqlite_index_args, argc, argv);
if (err != 0) {
goto end;
}
sist2_sqlite_index(sqlite_index_args);
} else if (strcmp(argv[0], "web") == 0) {
int err = web_args_validate(web_args, argc, argv);