use sqlite to save index, major thread pool refactor

simon987 2023-04-03 21:39:50 -04:00
parent ca973d63a4
commit fc36f33d52
62 changed files with 3630 additions and 4673 deletions

.gitignore vendored

@@ -41,3 +41,5 @@ build.ninja
 src/web/static_generated.c
 src/magic_generated.c
 src/index/static_generated.c
+*.sist2
+*-shm

CMakeLists.txt

@@ -22,30 +22,33 @@ set(ARGPARSE_SHARED off)
 add_subdirectory(third-party/argparse)

 add_executable(sist2
+        # argparse
+        third-party/argparse/argparse.h third-party/argparse/argparse.c
         src/main.c
         src/sist.h
         src/io/walk.h src/io/walk.c
-        src/io/store.h src/io/store.c
         src/tpool.h src/tpool.c
         src/parsing/parse.h src/parsing/parse.c
+        src/parsing/magic_util.c src/parsing/magic_util.h
         src/io/serialize.h src/io/serialize.c
         src/parsing/mime.h src/parsing/mime.c src/parsing/mime_generated.c
         src/index/web.c src/index/web.h
         src/web/serve.c src/web/serve.h
+        src/web/web_util.c src/web/web_util.h
         src/index/elastic.c src/index/elastic.h
         src/util.c src/util.h
-        src/ctx.h src/types.h src/ctx.c
+        src/ctx.h
+        src/types.h
         src/log.c src/log.h
         src/cli.c src/cli.h
-        src/stats.c src/stats.h
+        src/ctx.c
         src/parsing/sidecar.c src/parsing/sidecar.h
-        src/mempool/mempool.c src/mempool/mempool.h
+        src/database/database.c src/database/database.h
+        src/parsing/fs_util.h
         src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp
-        # argparse
-        third-party/argparse/argparse.h third-party/argparse/argparse.c
-        )
+        src/database/database_stats.c src/database/database_stats.h src/database/database_schema.c)

 set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)
 target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
@@ -53,8 +56,6 @@ set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
 find_package(PkgConfig REQUIRED)

-pkg_search_module(GLIB REQUIRED glib-2.0)
 find_package(lmdb CONFIG REQUIRED)
 find_package(cJSON CONFIG REQUIRED)
 find_package(unofficial-mongoose CONFIG REQUIRED)
@@ -63,6 +64,7 @@ find_library(MAGIC_LIB
         NAMES libmagic.so.1 magic
         PATHS /usr/lib/x86_64-linux-gnu/ /usr/lib/aarch64-linux-gnu/
 )
+find_package(unofficial-sqlite3 CONFIG REQUIRED)

 target_include_directories(
@@ -71,7 +73,6 @@ target_include_directories(
         ${CMAKE_SOURCE_DIR}/third-party/utf8.h/
         ${CMAKE_SOURCE_DIR}/third-party/libscan/
         ${CMAKE_SOURCE_DIR}/
-        ${GLIB_INCLUDE_DIRS}
 )

 target_compile_options(
@@ -90,6 +91,7 @@ if (SIST_DEBUG)
         -fsanitize=address
         -fno-inline
         # -O2
+        -w
 )
 target_link_options(
         sist2
@@ -121,6 +123,7 @@ else ()
         -Ofast
         -fno-stack-protector
         -fomit-frame-pointer
+        -w
 )
 endif ()
@@ -137,17 +140,15 @@ target_link_libraries(
         lmdb
         cjson
         argparse
-        ${GLIB_LDFLAGS}
         unofficial::mongoose::mongoose
         CURL::libcurl
         pthread
-        c
         scan
         ${MAGIC_LIB}
+        unofficial::sqlite3::sqlite3
 )

 add_custom_target(

@@ -150,7 +150,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
 ```bash
 vcpkg install curl[core,openssl]
-vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
+vcpkg install lmdb sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
 ```
 1. Build

@@ -1,10 +1,13 @@
 #!/usr/bin/env bash
-rm -rf index.sist2/
+(
+cd ..
+rm -rf index.sist2
 python3 scripts/mime.py > src/parsing/mime_generated.c
 python3 scripts/serve_static.py > src/web/static_generated.c
 python3 scripts/index_static.py > src/index/static_generated.c
 python3 scripts/magic_static.py > src/magic_generated.c
 printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
+)

scripts/mime.csv

@@ -29,7 +29,7 @@ application/mime, aps
 application/mspowerpoint, ppz
 application/msword, doc|dot|w6w|wiz|word
 application/netmc, mcp
-application/octet-stream, bin|dump|gpg
+application/octet-stream, bin|dump|gpg|pack|idx
 application/oda, oda
 application/ogg, ogv
 application/pdf, pdf
@@ -243,7 +243,7 @@ audio/make, funk|my|pfunk
 audio/midi, kar
 audio/mid, rmi
 audio/mp4, m4b
-audio/mpeg, m2a|mpa
+audio/mpeg, m2a|mpa|mpga
 audio/ogg, ogg
 audio/s3m, s3m
 audio/tsp-audio, tsi
@@ -382,7 +382,7 @@ text/x-pascal, p
 text/x-perl, pl
 text/x-php, php
 text/x-po, po
-text/x-python, py
+text/x-python, py|pyi
 text/x-ruby, rb
 text/x-sass, sass
 text/x-scss, scss

scripts/mime.py

@@ -1,3 +1,5 @@
+import zlib
+
 mimes = {}
 noparse = set()
 ext_in_hash = set()
@@ -135,24 +137,40 @@ def clean(t):
     return t.replace("/", "_").replace(".", "_").replace("+", "_").replace("-", "_")


+def crc(s):
+    return zlib.crc32(s.encode()) & 0xffffffff
+
+
 with open("scripts/mime.csv") as f:
     for l in f:
         mime, ext_list = l.split(",")
         if l.startswith("!"):
             mime = mime[1:]
             noparse.add(mime)
-        ext = [x.strip() for x in ext_list.split("|")]
+        ext = [x.strip() for x in ext_list.split("|") if x.strip() != ""]
         mimes[mime] = ext

+seen_crc = set()
+for ext in mimes.values():
+    for e in ext:
+        if crc(e) in seen_crc:
+            raise Exception("CRC32 collision")
+        seen_crc.add(crc(e))
+
+seen_crc = set()
+for mime in mimes.keys():
+    if crc(mime) in seen_crc:
+        raise Exception("CRC32 collision")
+    seen_crc.add(crc(mime))
+
 print("// **Generated by mime.py**")
 print("#ifndef MIME_GENERATED_C")
 print("#define MIME_GENERATED_C")
-print("#include <glib.h>\n")
 print("#include <stdlib.h>\n")

 # Enum
 print("enum mime {")
 for mime, ext in sorted(mimes.items()):
-    print("    " + clean(mime) + "=" + mime_id(mime) + ",")
+    print(f"{clean(mime)}={mime_id(mime)},")
 print("};")

 # Enum -> string
@@ -163,20 +181,20 @@ with open("scripts/mime.csv") as f:
 print("default: return NULL;}}")

 # Ext -> Enum
-print("GHashTable *mime_get_ext_table() {"
-      "GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);")
+print("unsigned int mime_extension_lookup(unsigned long extension_crc32) {"
+      "switch (extension_crc32) {")
 for mime, ext in mimes.items():
-    for e in [e for e in ext if e]:
-        print("g_hash_table_insert(ext_table, \"" + e + "\", (gpointer)" + clean(mime) + ");")
-        if e in ext_in_hash:
-            raise Exception("extension already in hash: " + e)
-        ext_in_hash.add(e)
-print("return ext_table;}")
+    if len(ext) > 0:
+        for e in ext:
+            print(f"case {crc(e)}:", end="")
+            print(f"return {clean(mime)};")
+print("default: return 0;}}")

 # string -> Enum
-print("GHashTable *mime_get_mime_table() {"
-      "GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);")
-for mime, ext in mimes.items():
-    print("g_hash_table_insert(mime_table, \"" + mime + "\", (gpointer)" + clean(mime) + ");")
-print("return mime_table;}")
+print("unsigned int mime_name_lookup(unsigned long mime_crc32) {"
+      "switch (mime_crc32) {")
+for mime in mimes.keys():
+    print(f"case {crc(mime)}: return {clean(mime)};")
+print("default: return 0;}}")

 print("#endif")

@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta http-equiv="X-UA-Compatible" content="IE=edge">
     <meta name="viewport" content="width=device-width,initial-scale=1.0">
-    <link rel="icon" href="<%= BASE_URL %>favicon.ico">
+    <link rel="icon" href="<%= BASE_URL %>serve_favicon_ico.ico">
     <title>sist2-admin</title>
 </head>
 <body>

src/auth0/auth0_c_api.h

@@ -1,12 +1,13 @@
 #ifndef SIST2_AUTH0_C_API_H
 #define SIST2_AUTH0_C_API_H

-#include "stdlib.h"
-
 #ifdef __cplusplus
 #define EXTERNC extern "C"
+#include "cstdlib"
 #else
 #define EXTERNC
+#include "stdlib.h"
 #endif

 #define AUTH0_OK (0)

src/cli.c

@@ -2,16 +2,17 @@
 #include "ctx.h"
 #include <tesseract/capi.h>

-#define DEFAULT_OUTPUT "index.sist2/"
+#define DEFAULT_OUTPUT "index.sist2"
+#define DEFAULT_NAME "index"
 #define DEFAULT_CONTENT_SIZE 32768
 #define DEFAULT_QUALITY 2
-#define DEFAULT_THUMBNAIL_SIZE 500
+#define DEFAULT_THUMBNAIL_SIZE 552
 #define DEFAULT_THUMBNAIL_COUNT 1
 #define DEFAULT_REWRITE_URL ""

 #define DEFAULT_ES_URL "http://localhost:9200"
 #define DEFAULT_ES_INDEX "sist2"
-#define DEFAULT_BATCH_SIZE 100
+#define DEFAULT_BATCH_SIZE 70
 #define DEFAULT_TAGLINE "Lightning-fast file system indexer and search tool"
 #define DEFAULT_LANG "en"
@@ -20,8 +21,6 @@
 #define DEFAULT_MAX_MEM_BUFFER 2000

-#define DEFAULT_THROTTLE_MEMORY_THRESHOLD 0
-
 const char *TESS_DATAPATHS[] = {
         "/usr/share/tessdata/",
         "/usr/share/tesseract-ocr/tessdata/",
@@ -48,9 +47,6 @@ void scan_args_destroy(scan_args_t *args) {
     if (args->name != NULL) {
         free(args->name);
     }
-    if (args->incremental != NULL) {
-        free(args->incremental);
-    }
     if (args->path != NULL) {
         free(args->path);
     }
@@ -61,7 +57,6 @@ void scan_args_destroy(scan_args_t *args) {
 }

 void index_args_destroy(index_args_t *args) {
-    //todo
     if (args->es_mappings_path) {
         free(args->es_mappings);
     }
@@ -76,7 +71,6 @@ void index_args_destroy(index_args_t *args) {
 }

 void web_args_destroy(web_args_t *args) {
-    //todo
     free(args);
 }
@@ -97,19 +91,13 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     char *abs_path = abspath(argv[1]);
     if (abs_path == NULL) {
-        LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
+        LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
     } else {
-        abs_path = realloc(abs_path, strlen(abs_path) + 2);
-        strcat(abs_path, "/");
         args->path = abs_path;
     }

-    if (args->incremental != OPTION_VALUE_UNSPECIFIED) {
-        args->incremental = abspath(args->incremental);
-        if (abs_path == NULL) {
-            sist_log("main.c", LOG_SIST_WARNING, "Could not open original index! Disabled incremental scan feature.");
-            args->incremental = NULL;
-        }
-    }
-
     if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) {
         args->tn_quality = DEFAULT_QUALITY;
     } else if (args->tn_quality < 2 || args->tn_quality > 31) {
@@ -152,20 +140,24 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
         args->output = expandpath(args->output);
     }

-    int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR);
-    if (ret != 0) {
-        fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno));
-        return 1;
+    char *abs_output = abspath(args->output);
+    if (args->incremental && abs_output == NULL) {
+        LOG_WARNINGF("main.c", "Could not open original index for incremental scan: %s. Will not perform incremental scan.", abs_output);
+        args->incremental = FALSE;
+    } else if (!args->incremental && abs_output != NULL) {
+        LOG_FATALF("main.c", "Index already exists: %s. If you wish to perform incremental scan, you must specify --incremental", abs_output);
     }
+    free(abs_output);

     if (args->depth <= 0) {
-        args->depth = G_MAXINT32;
+        args->depth = 2147483647;
     } else {
         args->depth += 1;
     }

     if (args->name == OPTION_VALUE_UNSPECIFIED) {
-        args->name = g_path_get_basename(args->output);
+        args->name = malloc(strlen(DEFAULT_NAME) + 1);
+        strcpy(args->name, DEFAULT_NAME);
     } else {
         char *tmp = malloc(strlen(args->name) + 1);
         strcpy(tmp, args->name);
@@ -224,7 +216,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     }
     if (trained_data_path != NULL && path != trained_data_path) {
         LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
-                  "files must be in the same folder")
+                  "files must be in the same folder");
     }

     trained_data_path = path;
@@ -232,7 +224,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     }
     free(lang);

-    ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
+    int ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
     if (ret != 0) {
         fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
         return 1;
@@ -249,12 +241,12 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
     if (error != NULL) {
-        LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
+        LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset);
     }

     pcre_extra *re_extra = pcre_study(re, 0, &error);
     if (error != NULL) {
-        LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
+        LOG_FATALF("cli.c", "pcre_study returned error: %s", error);
     }

     ScanCtx.exclude = re;
@@ -276,7 +268,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     if (args->list_path != OPTION_VALUE_UNSPECIFIED) {
         if (strcmp(args->list_path, "-") == 0) {
             args->list_file = stdin;
-            LOG_DEBUG("cli.c", "Using stdin as list file")
+            LOG_DEBUG("cli.c", "Using stdin as list file");
         } else {
             args->list_file = fopen(args->list_path, "r");
@@ -286,27 +278,27 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
         }
     }

-    LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality)
-    LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size)
-    LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count)
-    LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
-    LOG_DEBUGF("cli.c", "arg threads=%d", args->threads)
-    LOG_DEBUGF("cli.c", "arg incremental=%s", args->incremental)
-    LOG_DEBUGF("cli.c", "arg output=%s", args->output)
-    LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url)
-    LOG_DEBUGF("cli.c", "arg name=%s", args->name)
-    LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
-    LOG_DEBUGF("cli.c", "arg path=%s", args->path)
-    LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
-    LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase)
-    LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
-    LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
-    LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
-    LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
-    LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
-    LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
-    LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib)
-    LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
+    LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality);
+    LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size);
+    LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count);
+    LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size);
+    LOG_DEBUGF("cli.c", "arg threads=%d", args->threads);
+    LOG_DEBUGF("cli.c", "arg incremental=%d", args->incremental);
+    LOG_DEBUGF("cli.c", "arg output=%s", args->output);
+    LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url);
+    LOG_DEBUGF("cli.c", "arg name=%s", args->name);
+    LOG_DEBUGF("cli.c", "arg depth=%d", args->depth);
+    LOG_DEBUGF("cli.c", "arg path=%s", args->path);
+    LOG_DEBUGF("cli.c", "arg archive=%s", args->archive);
+    LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase);
+    LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang);
+    LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path);
+    LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex);
+    LOG_DEBUGF("cli.c", "arg fast=%d", args->fast);
+    LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub);
+    LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold);
+    LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib);
+    LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path);

     return 0;
 }
@@ -316,20 +308,20 @@ int load_external_file(const char *file_path, char **dst) {
     int res = stat(file_path, &info);
     if (res == -1) {
-        LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
+        LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
         return 1;
     }

     int fd = open(file_path, O_RDONLY);
     if (fd == -1) {
-        LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
+        LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
         return 1;
     }

     *dst = malloc(info.st_size + 1);
     res = read(fd, *dst, info.st_size);
     if (res < 0) {
-        LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno))
+        LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno));
         return 1;
     }
@@ -357,7 +349,7 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
     char *index_path = abspath(argv[1]);
     if (index_path == NULL) {
-        LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
+        LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
     } else {
         args->index_path = index_path;
     }
@@ -392,28 +384,28 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
         args->batch_size = DEFAULT_BATCH_SIZE;
     }

-    LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
-    LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
-    LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
-    LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path)
-    LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
-    LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script)
+    LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
+    LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
+    LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
+    LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path);
+    LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);
+    LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script);

     if (args->script) {
         char log_buf[5000];
         strncpy(log_buf, args->script, sizeof(log_buf));
         *(log_buf + sizeof(log_buf) - 1) = '\0';
-        LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
+        LOG_DEBUGF("cli.c", "arg script=%s", log_buf);
     }

-    LOG_DEBUGF("cli.c", "arg print=%d", args->print)
-    LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path)
-    LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings)
-    LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path)
-    LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings)
-    LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size)
-    LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset)
+    LOG_DEBUGF("cli.c", "arg print=%d", args->print);
+    LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path);
+    LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings);
+    LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path);
+    LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings);
+    LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size);
+    LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset);

     return 0;
 }
@@ -534,23 +526,24 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
     for (int i = 0; i < args->index_count; i++) {
         char *abs_path = abspath(args->indices[i]);
         if (abs_path == NULL) {
-            LOG_FATALF("cli.c", "Index not found: %s", args->indices[i])
+            LOG_FATALF("cli.c", "Index not found: %s", args->indices[i]);
         }
+        free(abs_path);
     }

-    LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
-    LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
-    LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
-    LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline)
-    LOG_DEBUGF("cli.c", "arg dev=%d", args->dev)
-    LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address)
-    LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials)
-    LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials)
-    LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user)
-    LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass)
-    LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count)
+    LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
+    LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
+    LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
+    LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline);
+    LOG_DEBUGF("cli.c", "arg dev=%d", args->dev);
+    LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address);
+    LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials);
+    LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials);
+    LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user);
+    LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass);
+    LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count);

     for (int i = 0; i < args->index_count; i++) {
-        LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i])
+        LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i]);
     }

     return 0;
@@ -575,7 +568,7 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
     char *index_path = abspath(argv[1]);
     if (index_path == NULL) {
-        LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1])
+        LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1]);
     } else {
         args->index_path = index_path;
     }
@@ -596,12 +589,12 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
         return 1;
     }

-    LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
+    LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);

     char log_buf[5000];
     strncpy(log_buf, args->script, sizeof(log_buf));
     *(log_buf + sizeof(log_buf) - 1) = '\0';
-    LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
+    LOG_DEBUGF("cli.c", "arg script=%s", log_buf);

     return 0;
 }

src/cli.h

@@ -13,7 +13,7 @@ typedef struct scan_args {
     int tn_size;
     int content_size;
     int threads;
-    char *incremental;
+    int incremental;
     char *output;
     char *rewrite_url;
     char *name;

src/ctx.c

@@ -3,9 +3,10 @@
 ScanCtx_t ScanCtx = {
         .stat_index_size = 0,
         .stat_tn_size = 0,
-        .dbg_current_files = NULL,
-        .pool = NULL
+        .pool = NULL,
+        .index.path = {0,},
 };
 WebCtx_t WebCtx;
 IndexCtx_t IndexCtx;
 LogCtx_t LogCtx;
+__thread ProcData_t ProcData;

src/ctx.h

@@ -16,22 +16,17 @@
 #include "libscan/msdoc/msdoc.h"
 #include "libscan/wpd/wpd.h"
 #include "libscan/json/json.h"
-#include "src/io/store.h"
+#include "src/database/database.h"
 #include "src/index/elastic.h"
+#include "sqlite3.h"

-#include <glib.h>
 #include <pcre.h>

 typedef struct {
     struct index_t index;

-    GHashTable *mime_table;
-    GHashTable *ext_table;
-
     tpool_t *pool;
-    tpool_t *writer_pool;

     int threads;
     int depth;
     int calculate_checksums;
@@ -39,16 +34,10 @@ typedef struct {
     size_t stat_tn_size;
     size_t stat_index_size;

-    GHashTable *original_table;
-    GHashTable *copy_table;
-    GHashTable *new_table;
-    pthread_mutex_t copy_table_mu;
-
     pcre *exclude;
     pcre_extra *exclude_extra;
     int fast;

-    GHashTable *dbg_current_files;
     pthread_mutex_t dbg_current_files_mu;

     int dbg_failed_files_count;
@@ -84,10 +73,6 @@ typedef struct {
     char *es_index;
     int batch_size;
     tpool_t *pool;
-    store_t *tag_store;
-    GHashTable *tags;
-    store_t *meta_store;
-    GHashTable *meta;

     /**
      * Set to false when using --print
      */
@@ -117,10 +102,18 @@ typedef struct {
     int dev;
 } WebCtx_t;

+typedef struct {
+    int thread_id;
+    database_t *ipc_db;
+    database_t *index_db;
+} ProcData_t;
+
 extern ScanCtx_t ScanCtx;
 extern WebCtx_t WebCtx;
 extern IndexCtx_t IndexCtx;
 extern LogCtx_t LogCtx;
+extern __thread ProcData_t ProcData;

 #endif
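
ProcData is declared `__thread`, so every worker in the refactored pool carries its own SQLite handles instead of sharing GLib tables and LMDB stores behind locks. A rough sketch of what per-thread setup could look like, assuming a hypothetical bootstrap function (the real initialization lives in src/tpool.c, which is elsewhere in this commit):

```c
// Hypothetical worker bootstrap; the function name and the db path are illustrative.
static void worker_startup(int thread_id, const char *ipc_db_path) {
    ProcData.thread_id = thread_id;  // thread-local: private to this worker
    ProcData.ipc_db = database_create(ipc_db_path, IPC_CONSUMER_DATABASE);
    database_open(ProcData.ipc_db);  // opens the connection and prepares the pop_*_stmt statements
}
```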

src/database/database.c Normal file

@@ -0,0 +1,586 @@
#include "database.h"
#include "malloc.h"
#include "src/ctx.h"
#include <string.h>
#include <pthread.h>
#include "src/util.h"
#include <time.h>

database_t *database_create(const char *filename, database_type_t type) {
    database_t *db = malloc(sizeof(database_t));

    strcpy(db->filename, filename);
    db->type = type;
    db->select_thumbnail_stmt = NULL;
    db->ipc_ctx = NULL;

    return db;
}

__always_inline
static int sep_rfind(const char *str) {
    for (int i = (int) strlen(str); i >= 0; i--) {
        if (str[i] == '/') {
            return i;
        }
    }
    return -1;
}

void path_parent_func(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
    if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
        sqlite3_result_error(ctx, "Invalid parameters", -1);
    }

    const char *value = (const char *) sqlite3_value_text(argv[0]);

    int stop = sep_rfind(value);
    if (stop == -1) {
        sqlite3_result_null(ctx);
        return;
    }

    char parent[PATH_MAX * 3];
    strncpy(parent, value, stop);

    sqlite3_result_text(ctx, parent, stop, SQLITE_TRANSIENT);
}

void save_current_job_info(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
    if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
        sqlite3_result_error(ctx, "Invalid parameters", -1);
    }

    database_ipc_ctx_t *ipc_ctx = sqlite3_user_data(ctx);

    const char *current_job = (const char *) sqlite3_value_text(argv[0]);

    char buf[PATH_MAX];
    strcpy(buf, current_job);
    strcpy(ipc_ctx->current_job[ProcData.thread_id], current_job);

    sqlite3_result_text(ctx, "ok", -1, SQLITE_STATIC);
}

void database_initialize(database_t *db) {
    CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));

    LOG_DEBUGF("database.c", "Initializing database %s", db->filename);

    if (db->type == INDEX_DATABASE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IndexDatabaseSchema, NULL, NULL, NULL));
    } else if (db->type == IPC_CONSUMER_DATABASE || db->type == IPC_PRODUCER_DATABASE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IpcDatabaseSchema, NULL, NULL, NULL));
    }

    sqlite3_close(db->db);
}

void database_open(database_t *db) {
    LOG_DEBUGF("tpool.c", "Opening database %s (%d)", db->filename, db->type);

    CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA cache_size = -200000;", NULL, NULL, NULL));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA synchronous = OFF;", NULL, NULL, NULL));

    if (db->type == INDEX_DATABASE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA temp_store = memory;", NULL, NULL, NULL));
    }

    if (db->type == INDEX_DATABASE) {
        // Prepare statements
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "SELECT data FROM thumbnail WHERE id=? AND num=? LIMIT 1;", -1,
                &db->select_thumbnail_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "UPDATE document SET marked=1 WHERE id=? AND mtime=? RETURNING id",
                -1,
                &db->mark_document_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "REPLACE INTO document_sidecar (id, json_data) VALUES (?,?)", -1,
                &db->write_document_sidecar_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "REPLACE INTO document (id, mtime, size, json_data) VALUES (?, ?, ?, ?);", -1,
                &db->write_document_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "INSERT INTO thumbnail (id, num, data) VALUES (?,?,?) ON CONFLICT DO UPDATE SET data=excluded.data;", -1,
                &db->write_thumbnail_stmt, NULL));

        // Create functions
        sqlite3_create_function(
                db->db,
                "path_parent",
                1,
                SQLITE_UTF8,
                NULL,
                path_parent_func,
                NULL,
                NULL
        );
    } else if (db->type == IPC_CONSUMER_DATABASE) {
        sqlite3_create_function(
                db->db,
                "save_current_job_info",
                1,
                SQLITE_UTF8,
                db->ipc_ctx,
                save_current_job_info,
                NULL,
                NULL
        );

        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "DELETE FROM parse_job WHERE id = (SELECT MIN(id) FROM parse_job)"
                " RETURNING filepath,mtime,st_size,save_current_job_info(filepath);",
                -1, &db->pop_parse_job_stmt, NULL
        ));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "DELETE FROM index_job WHERE id = (SELECT MIN(id) FROM index_job)"
                " RETURNING doc_id,type,line;",
                -1, &db->pop_index_job_stmt, NULL
        ));
    } else if (db->type == IPC_PRODUCER_DATABASE) {
        char sql[40];
        int max_size_mb = 10; // TODO: read from args.
        snprintf(sql, sizeof(sql), "PRAGMA max_page_count=%d", (max_size_mb * 1024 * 1024) / 4096);
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, sql, NULL, NULL, NULL));

        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db, "INSERT INTO parse_job (filepath,mtime,st_size) VALUES (?,?,?);", -1,
                &db->insert_parse_job_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db, "INSERT INTO index_job (doc_id,type,line) VALUES (?,?,?);", -1,
                &db->insert_index_job_stmt, NULL));

        sqlite3_create_function(
                db->db,
                "path_parent",
                1,
                SQLITE_UTF8,
                NULL,
                path_parent_func,
                NULL,
                NULL
        );
    }
}

void database_close(database_t *db, int optimize) {
    LOG_DEBUGF("database.c", "Closing database %s", db->filename);

    if (optimize) {
        LOG_DEBUG("database.c", "Optimizing database");
        // TODO: This should be an optional argument
        // CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "VACUUM;", NULL, NULL, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA optimize;", NULL, NULL, NULL));
    }

    sqlite3_close(db->db);

    free(db);
    db = NULL;
}

void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len) {
    sqlite3_bind_text(db->select_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->select_thumbnail_stmt, 2, num);

    int ret = sqlite3_step(db->select_thumbnail_stmt);

    // TODO: if row not found, return null
    if (ret != SQLITE_ROW) {
        LOG_FATALF("database.c", "FIXME: tn step returned %d", ret);
    }

    const void *blob = sqlite3_column_blob(db->select_thumbnail_stmt, 0);
    const int blob_size = sqlite3_column_bytes(db->select_thumbnail_stmt, 0);

    *return_value_len = blob_size;
    void *return_data = malloc(blob_size);
    memcpy(return_data, blob, blob_size);

    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->select_thumbnail_stmt));

    return return_data;
}

void database_write_index_descriptor(database_t *db, index_descriptor_t *desc) {
    sqlite3_exec(db->db, "DELETE FROM descriptor;", NULL, NULL, NULL);

    sqlite3_stmt *stmt;
    sqlite3_prepare_v2(db->db, "INSERT INTO descriptor (id, version_major, version_minor, version_patch,"
                               " root, name, rewrite_url, timestamp) VALUES (?,?,?,?,?,?,?,?);", -1, &stmt, NULL);
    sqlite3_bind_text(stmt, 1, desc->id, -1, SQLITE_STATIC);
    sqlite3_bind_int(stmt, 2, desc->version_major);
    sqlite3_bind_int(stmt, 3, desc->version_minor);
    sqlite3_bind_int(stmt, 4, desc->version_patch);
    sqlite3_bind_text(stmt, 5, desc->root, -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 6, desc->name, -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 7, desc->rewrite_url, -1, SQLITE_STATIC);
    sqlite3_bind_int64(stmt, 8, desc->timestamp);

    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    sqlite3_finalize(stmt);
}

index_descriptor_t *database_read_index_descriptor(database_t *db) {
    sqlite3_stmt *stmt;
    sqlite3_prepare_v2(db->db, "SELECT id, version_major, version_minor, version_patch,"
                               " root, name, rewrite_url, timestamp FROM descriptor;", -1, &stmt, NULL);

    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    const char *id = (char *) sqlite3_column_text(stmt, 0);
    int v_major = sqlite3_column_int(stmt, 1);
    int v_minor = sqlite3_column_int(stmt, 2);
    int v_patch = sqlite3_column_int(stmt, 3);
    const char *root = (char *) sqlite3_column_text(stmt, 4);
    const char *name = (char *) sqlite3_column_text(stmt, 5);
    const char *rewrite_url = (char *) sqlite3_column_text(stmt, 6);
    int timestamp = sqlite3_column_int(stmt, 7);

    index_descriptor_t *desc = malloc(sizeof(index_descriptor_t));
    strcpy(desc->id, id);
    snprintf(desc->version, sizeof(desc->version), "%d.%d.%d", v_major, v_minor, v_patch);
    desc->version_major = v_major;
    desc->version_minor = v_minor;
    desc->version_patch = v_patch;
    strcpy(desc->root, root);
    strcpy(desc->name, name);
    strcpy(desc->rewrite_url, rewrite_url);
    desc->timestamp = timestamp;

    CRASH_IF_NOT_SQLITE_OK(sqlite3_finalize(stmt));

    return desc;
}

database_iterator_t *database_create_document_iterator(database_t *db) {
    sqlite3_stmt *stmt;

    // TODO: remove mtime, size, _id from json_data
    sqlite3_prepare_v2(db->db, "WITH doc (j) AS (SELECT CASE"
                               " WHEN sc.json_data IS NULL THEN"
                               "  CASE"
                               "   WHEN t.tag IS NULL THEN"
                               "    document.json_data"
                               "   ELSE"
                               "    json_set(document.json_data, '$.tag', json_group_array(t.tag))"
                               "  END"
                               " ELSE"
                               "  CASE"
                               "   WHEN t.tag IS NULL THEN"
                               "    json_patch(document.json_data, sc.json_data)"
                               "   ELSE"
                               // This will overwrite any tags specified in the sidecar file!
                               // TODO: concatenate the two arrays?
                               "    json_set(json_patch(document.json_data, sc.json_data), '$.tag', json_group_array(t.tag))"
                               "  END"
                               " END"
                               " FROM document"
                               " LEFT JOIN document_sidecar sc ON document.id = sc.id"
                               " LEFT JOIN tag t ON document.id = t.id"
                               " GROUP BY document.id)"
                               " SELECT json_set(j, '$.index', (SELECT id FROM descriptor)) FROM doc", -1, &stmt, NULL);

    database_iterator_t *iter = malloc(sizeof(database_iterator_t));

    iter->stmt = stmt;
    iter->db = db;

    return iter;
}

cJSON *database_document_iter(database_iterator_t *iter) {
    if (iter->stmt == NULL) {
        LOG_ERROR("database.c", "FIXME: database_document_iter() called after iteration stopped");
        return NULL;
    }

    int ret = sqlite3_step(iter->stmt);

    if (ret == SQLITE_ROW) {
        const char *json_string = (const char *) sqlite3_column_text(iter->stmt, 0);
        return cJSON_Parse(json_string);
    }

    if (ret != SQLITE_DONE) {
        LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
    }

    if (sqlite3_finalize(iter->stmt) != SQLITE_OK) {
        LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
    }

    iter->stmt = NULL;

    return NULL;
}

cJSON *database_incremental_scan_begin(database_t *db) {
    LOG_DEBUG("database.c", "Preparing database for incremental scan");
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "UPDATE document SET marked=0;", NULL, NULL, NULL));
}

cJSON *database_incremental_scan_end(database_t *db) {
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM delete_list WHERE id IN (SELECT id FROM document WHERE marked=1);",
            NULL, NULL, NULL
    ));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM thumbnail WHERE id IN (SELECT id FROM document WHERE marked=0);",
            NULL, NULL, NULL
    ));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "INSERT INTO delete_list (id) SELECT id FROM document WHERE marked=0;",
            NULL, NULL, NULL
    ));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM document_sidecar WHERE id IN (SELECT id FROM document WHERE marked=0);",
            NULL, NULL, NULL
    ));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM document WHERE marked=0;",
            NULL, NULL, NULL
    ));
}

int database_mark_document(database_t *db, const char *id, int mtime) {
    sqlite3_bind_text(db->mark_document_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->mark_document_stmt, 2, mtime);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    int ret = sqlite3_step(db->mark_document_stmt);

    if (ret == SQLITE_ROW) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
        pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
        return TRUE;
    }

    if (ret == SQLITE_DONE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
        pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
        return FALSE;
    }

    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(ret);
}

void database_write_document(database_t *db, document_t *doc, const char *json_data) {
    sqlite3_bind_text(db->write_document_stmt, 1, doc->doc_id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->write_document_stmt, 2, doc->mtime);
    sqlite3_bind_int64(db->write_document_stmt, 3, (long) doc->size);
    sqlite3_bind_text(db->write_document_stmt, 4, json_data, -1, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}

void database_write_document_sidecar(database_t *db, const char *id, const char *json_data) {
    sqlite3_bind_text(db->write_document_sidecar_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_text(db->write_document_sidecar_stmt, 2, json_data, -1, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_sidecar_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_sidecar_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}

void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size) {
    sqlite3_bind_text(db->write_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->write_thumbnail_stmt, 2, num);
    sqlite3_bind_blob(db->write_thumbnail_stmt, 3, data, (int) data_size, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_thumbnail_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_thumbnail_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}

//void database_create_fts_index(database_t *db, database_t *fts_db) {
//    // In a separate file,
//
//    // use database_initialize() to create FTS schema
//    // if --force-reset, then truncate the tables first
//
//    /*
//     * create/append fts table
//     *
//     * create/append scalar index table with
//     *  id,index,size,mtime,mime
//     *
//     * create/append path index table with
//     *  index,path,depth
//     *
//     * content table is a view with SELECT UNION for all attached tables
//     * random_seed column
//     */
//
//    // INSERT INTO ft(ft) VALUES('optimize');
//}

job_t *database_get_work(database_t *db, job_type_t job_type) {
    job_t *job;

    pthread_mutex_lock(&db->ipc_ctx->mutex);
    while (db->ipc_ctx->job_count == 0 && !db->ipc_ctx->no_more_jobs) {
        pthread_cond_timedwait_ms(&db->ipc_ctx->has_work_cond, &db->ipc_ctx->mutex, 10);
    }
    pthread_mutex_unlock(&db->ipc_ctx->mutex);

    pthread_mutex_lock(&db->ipc_ctx->db_mutex);

    if (job_type == JOB_PARSE_JOB) {
        int ret = sqlite3_step(db->pop_parse_job_stmt);

        if (ret == SQLITE_DONE) {
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
            pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
            return NULL;
        } else {
            CRASH_IF_STMT_FAIL(ret);
        }

        job = malloc(sizeof(*job));

        job->parse_job = create_parse_job(
                (const char *) sqlite3_column_text(db->pop_parse_job_stmt, 0),
                sqlite3_column_int(db->pop_parse_job_stmt, 1),
                sqlite3_column_int64(db->pop_parse_job_stmt, 2));

        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
    } else {
        int ret = sqlite3_step(db->pop_index_job_stmt);

        if (ret == SQLITE_DONE) {
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
            pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
            return NULL;
        } else {
            CRASH_IF_STMT_FAIL(ret);
        }

        job = malloc(sizeof(*job));

        const char *line = (const char *) sqlite3_column_text(db->pop_index_job_stmt, 2);
        if (line != NULL) {
            job->bulk_line = malloc(sizeof(es_bulk_line_t) + strlen(line) + 1);
            strcpy(job->bulk_line->line, line);
        } else {
            job->bulk_line = malloc(sizeof(es_bulk_line_t));
        }
        strcpy(job->bulk_line->doc_id, (const char *) sqlite3_column_text(db->pop_index_job_stmt, 0));
        job->bulk_line->type = sqlite3_column_int(db->pop_index_job_stmt, 1);
        job->bulk_line->next = NULL;

        // TODO CRASH IF NOT OK
        sqlite3_step(db->pop_parse_job_stmt);
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
    }
    pthread_mutex_unlock(&db->ipc_ctx->db_mutex);

    pthread_mutex_lock(&db->ipc_ctx->mutex);
    db->ipc_ctx->job_count -= 1;
    pthread_mutex_unlock(&db->ipc_ctx->mutex);

    job->type = job_type;
    return job;
}

void database_add_work(database_t *db, job_t *job) {
    int ret;

    pthread_mutex_lock(&db->ipc_ctx->db_mutex);

    if (job->type == JOB_PARSE_JOB) {
        do {
            sqlite3_bind_text(db->insert_parse_job_stmt, 1, job->parse_job->filepath, -1, SQLITE_STATIC);
            sqlite3_bind_int(db->insert_parse_job_stmt, 2, job->parse_job->vfile.mtime);
            sqlite3_bind_int64(db->insert_parse_job_stmt, 3, (long) job->parse_job->vfile.st_size);

            ret = sqlite3_step(db->insert_parse_job_stmt);

            if (ret == SQLITE_FULL) {
                usleep(1000000);
            } else {
                CRASH_IF_STMT_FAIL(ret);
            }

            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->insert_parse_job_stmt));
        } while (ret != SQLITE_DONE);
    } else if (job->type == JOB_BULK_LINE) {
        do {
            sqlite3_bind_text(db->insert_index_job_stmt, 1, job->bulk_line->doc_id, -1, SQLITE_STATIC);
            sqlite3_bind_int(db->insert_index_job_stmt, 2, job->bulk_line->type);
            sqlite3_bind_text(db->insert_index_job_stmt, 3, job->bulk_line->line, -1, SQLITE_STATIC);

            ret = sqlite3_step(db->insert_index_job_stmt);

            if (ret == SQLITE_FULL) {
                sqlite3_reset(db->insert_index_job_stmt);
                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                usleep(100000);
                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
                continue;
            } else {
                CRASH_IF_STMT_FAIL(ret);
            }

            ret = sqlite3_reset(db->insert_index_job_stmt);
            if (ret == SQLITE_FULL) {
                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                usleep(100000);
                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
            }
        } while (ret != SQLITE_DONE && ret != SQLITE_OK);
    } else {
        LOG_FATAL("database.c", "FIXME: invalid job type");
    }
    pthread_mutex_unlock(&db->ipc_ctx->db_mutex);

    pthread_mutex_lock(&db->ipc_ctx->mutex);
    db->ipc_ctx->job_count += 1;
    pthread_cond_signal(&db->ipc_ctx->has_work_cond);
    pthread_mutex_unlock(&db->ipc_ctx->mutex);
}
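
The two functions above are the heart of the thread pool refactor: producers append rows to the IPC tables and signal `has_work_cond`, while consumers block until `job_count` is non-zero and then pop a row atomically. A hedged usage sketch, assuming the file path, timestamps, and the `parse()` handler are illustrative (the real call sites live in src/tpool.c and the scanner, elsewhere in this commit):

```c
// Producer: queue one file for parsing.
job_t job = {
        .type = JOB_PARSE_JOB,
        .parse_job = create_parse_job("/data/report.pdf", 1680000000, 4096), // illustrative args
};
database_add_work(ProcData.ipc_db, &job);

// Consumer: drain the queue. database_get_work() returns NULL once
// no_more_jobs is set and the parse_job table is empty.
job_t *work;
while ((work = database_get_work(ProcData.ipc_db, JOB_PARSE_JOB)) != NULL) {
    parse(work->parse_job); // illustrative handler
    free(work);
}
```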

src/database/database.h Normal file

@@ -0,0 +1,147 @@
#ifndef SIST2_DATABASE_H
#define SIST2_DATABASE_H

#include <sqlite3.h>
#include <cjson/cJSON.h>
#include "src/sist.h"
#include "src/index/elastic.h"

typedef struct index_descriptor index_descriptor_t;

extern const char *IpcDatabaseSchema;
extern const char *IndexDatabaseSchema;

typedef enum {
    INDEX_DATABASE,
    IPC_CONSUMER_DATABASE,
    IPC_PRODUCER_DATABASE,
    FTS_DATABASE
} database_type_t;

typedef enum {
    JOB_UNDEFINED,
    JOB_BULK_LINE,
    JOB_PARSE_JOB
} job_type_t;

typedef struct {
    job_type_t type;
    union {
        parse_job_t *parse_job;
        es_bulk_line_t *bulk_line;
    };
} job_t;

typedef struct {
    int job_count;
    int no_more_jobs;
    int completed_job_count;

    pthread_mutex_t mutex;
    pthread_mutex_t db_mutex;
    pthread_mutex_t index_db_mutex;
    pthread_cond_t has_work_cond;
    char current_job[256][PATH_MAX * 2];
} database_ipc_ctx_t;

typedef struct database {
    char filename[PATH_MAX];
    database_type_t type;
    sqlite3 *db;

    // Prepared statements
    sqlite3_stmt *select_thumbnail_stmt;
    sqlite3_stmt *treemap_merge_up_update_stmt;
    sqlite3_stmt *treemap_merge_up_delete_stmt;
    sqlite3_stmt *mark_document_stmt;
    sqlite3_stmt *write_document_stmt;
    sqlite3_stmt *write_document_sidecar_stmt;
    sqlite3_stmt *write_thumbnail_stmt;
    sqlite3_stmt *insert_parse_job_stmt;
    sqlite3_stmt *insert_index_job_stmt;
    sqlite3_stmt *pop_parse_job_stmt;
    sqlite3_stmt *pop_index_job_stmt;

    database_ipc_ctx_t *ipc_ctx;
} database_t;

typedef struct {
    database_t *db;
    sqlite3_stmt *stmt;
} database_iterator_t;

typedef struct {
    const char *path;
    const char *parent;
    long size;
} treemap_row_t;

static treemap_row_t null_treemap_row = {0, 0, 0};

database_t *database_create(const char *filename, database_type_t type);

void database_initialize(database_t *db);

void database_open(database_t *db);

void database_close(database_t *, int optimize);

void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size);

void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len);

void database_write_index_descriptor(database_t *db, index_descriptor_t *desc);

index_descriptor_t *database_read_index_descriptor(database_t *db);

void database_write_document(database_t *db, document_t *doc, const char *json_data);

database_iterator_t *database_create_document_iterator(database_t *db);

cJSON *database_document_iter(database_iterator_t *);

#define database_document_iter_foreach(element, iter) \
    for (cJSON *element = database_document_iter(iter); element != NULL; element = database_document_iter(iter))

cJSON *database_incremental_scan_begin(database_t *db);

cJSON *database_incremental_scan_end(database_t *db);

int database_mark_document(database_t *db, const char *id, int mtime);

void database_write_document_sidecar(database_t *db, const char *id, const char *json_data);

database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold);

treemap_row_t database_treemap_iter(database_iterator_t *iter);

#define database_treemap_iter_foreach(element, iter) \
    for (treemap_row_t element = database_treemap_iter(iter); element.path != NULL; element = database_treemap_iter(iter))

void database_generate_stats(database_t *db, double treemap_threshold);

job_t *database_get_work(database_t *db, job_type_t job_type);

void database_add_work(database_t *db, job_t *job);

//void database_index(database_t *db);

#define CRASH_IF_STMT_FAIL(x) do { \
    int return_value = x; \
    if (return_value != SQLITE_DONE && return_value != SQLITE_ROW) { \
        LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
    } \
} while (0)

#define CRASH_IF_NOT_SQLITE_OK(x) do { \
    int return_value = x; \
    if (return_value != SQLITE_OK) { \
        LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
    } \
} while (0)

#endif //SIST2_DATABASE_H
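
The `*_foreach` macros wrap the `sqlite3_step()` loop so call sites read like a plain for loop. A short sketch of iterating every merged document as cJSON, assuming `db` is an open INDEX_DATABASE handle and the caller frees each object and the iterator:

```c
database_iterator_t *iter = database_create_document_iterator(db);

database_document_iter_foreach(doc, iter) {
    // doc is a cJSON object with sidecar data and tags already merged in
    cJSON *mime = cJSON_GetObjectItem(doc, "mime"); // illustrative field access
    // ... index, print, etc. ...
    cJSON_Delete(doc);
}
free(iter); // assumption: the caller owns the iterator allocation
```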

src/database/database_schema.c Normal file

@@ -0,0 +1,78 @@
const char *IpcDatabaseSchema =
        "CREATE TABLE parse_job ("
        " id INTEGER PRIMARY KEY,"
        " filepath TEXT NOT NULL,"
        " mtime INTEGER NOT NULL,"
        " st_size INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE index_job ("
        " id INTEGER PRIMARY KEY,"
        " doc_id TEXT NOT NULL CHECK ( length(doc_id) = 32 ),"
        " type INTEGER NOT NULL,"
        " line TEXT"
        ");";

const char *IndexDatabaseSchema =
        "CREATE TABLE thumbnail ("
        " id TEXT NOT NULL CHECK ( length(id) = 32 ),"
        " num INTEGER NOT NULL,"
        " data BLOB NOT NULL,"
        " PRIMARY KEY(id, num)"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE document ("
        " id TEXT PRIMARY KEY CHECK ( length(id) = 32 ),"
        " marked INTEGER NOT NULL DEFAULT (1),"
        " mtime INTEGER NOT NULL,"
        " size INTEGER NOT NULL,"
        " json_data TEXT NOT NULL CHECK ( json_valid(json_data) )"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE delete_list ("
        " id TEXT PRIMARY KEY CHECK ( length(id) = 32 )"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE tag ("
        " id TEXT NOT NULL,"
        " tag TEXT NOT NULL"
        ");"
        ""
        "CREATE TABLE document_sidecar ("
        " id TEXT PRIMARY KEY NOT NULL,"
        " json_data TEXT NOT NULL"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE descriptor ("
        " id TEXT NOT NULL,"
        " version_major INTEGER NOT NULL,"
        " version_minor INTEGER NOT NULL,"
        " version_patch INTEGER NOT NULL,"
        " root TEXT NOT NULL,"
        " name TEXT NOT NULL,"
        " rewrite_url TEXT,"
        " timestamp INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_treemap ("
        " path TEXT NOT NULL,"
        " size INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_size_agg ("
        " bucket INTEGER NOT NULL,"
        " count INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_date_agg ("
        " bucket INTEGER NOT NULL,"
        " count INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_mime_agg ("
        " mime TEXT NOT NULL,"
        " size INTEGER NOT NULL,"
        " count INTEGER NOT NULL"
        ");";

View File

@ -0,0 +1,159 @@
#include "database.h"
#include "src/sist.h"
#include "src/ctx.h"
#define TREEMAP_MINIMUM_MERGES_TO_CONTINUE (100)
#define SIZE_BUCKET (long)(5 * 1000 * 1000)
#define DATE_BUCKET (long)(2629800) // ~30 days
database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold) {
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db,
"SELECT path, path_parent(path), size FROM tm"
" WHERE path_parent(path) IN (SELECT path FROM tm)"
" AND size<?",
-1, &stmt, NULL);
sqlite3_bind_int64(stmt, 1, threshold);
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
iter->stmt = stmt;
iter->db = db;
return iter;
}
treemap_row_t database_treemap_iter(database_iterator_t *iter) {
if (iter->stmt == NULL) {
LOG_FATAL("database.c", "FIXME: database_treemap_iter() called after iteration stopped");
}
int ret = sqlite3_step(iter->stmt);
if (ret == SQLITE_ROW) {
treemap_row_t row = {
.path = (const char *) sqlite3_column_text(iter->stmt, 0),
.parent = (const char *) sqlite3_column_text(iter->stmt, 1),
.size = sqlite3_column_int64(iter->stmt, 2)
};
return row;
}
if (ret != SQLITE_DONE) {
LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
}
sqlite3_finalize(iter->stmt);
iter->stmt = NULL;
return (treemap_row_t) {NULL, NULL, 0};
}
void database_generate_stats(database_t *db, double treemap_threshold) {
LOG_INFO("database.c", "Generating stats");
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_size_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_date_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_mime_agg;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_treemap;", NULL, NULL, NULL));
CRASH_IF_NOT_SQLITE_OK(
sqlite3_exec(db->db, "CREATE TEMP TABLE tm(path TEXT PRIMARY KEY, size INT);", NULL, NULL, NULL));
sqlite3_prepare_v2(db->db, "UPDATE tm SET size=size+? WHERE path=?;", -1, &db->treemap_merge_up_update_stmt, NULL);
sqlite3_prepare_v2(db->db, "DELETE FROM tm WHERE path = ?;", -1, &db->treemap_merge_up_delete_stmt, NULL);
// size aggregation
sqlite3_stmt *stmt;
sqlite3_prepare_v2(db->db, "INSERT INTO stats_size_agg"
" SELECT"
" cast(size / ?1 as int) * ?1 as bucket,"
" count(*) as count"
" FROM document"
" GROUP BY bucket", -1, &stmt, NULL);
sqlite3_bind_int(stmt, 1, SIZE_BUCKET);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// date aggregation
sqlite3_prepare_v2(db->db, "INSERT INTO stats_date_agg"
" SELECT"
" cast(mtime / ?1 as int) * ?1 as bucket,"
" count(*) as count"
" FROM document"
" GROUP BY bucket", -1, &stmt, NULL);
sqlite3_bind_int(stmt, 1, DATE_BUCKET);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// mime aggregation
sqlite3_prepare_v2(db->db, "INSERT INTO stats_mime_agg"
" SELECT"
" (json_data->>'mime') as bucket,"
" sum(size),"
" count(*)"
" FROM document"
" WHERE bucket IS NOT NULL"
" GROUP BY bucket", -1, &stmt, NULL);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
sqlite3_finalize(stmt);
// Treemap
sqlite3_prepare_v2(db->db, "SELECT SUM(size) FROM document;", -1, &stmt, NULL);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
long total_size = sqlite3_column_int64(stmt, 0);
long threshold = (long) ((double) total_size * treemap_threshold);
sqlite3_finalize(stmt);
// flat map
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
"INSERT INTO tm (path, size) SELECT json_data->>'path' as path, sum(size)"
" FROM document WHERE json_data->>'parent' IS NULL GROUP BY path;",
NULL, NULL, NULL));
// Merge up
int merged_rows = 0;
do {
if (merged_rows) {
LOG_INFOF("database.c", "Treemap merge iteration (%d rows changed)", merged_rows);
}
merged_rows = 0;
sqlite3_prepare_v2(db->db,
"INSERT INTO tm (path, size) SELECT path_parent(path) as parent, 0 "
" FROM tm WHERE parent not IN (SELECT path FROM tm) AND size<?"
" ON CONFLICT DO NOTHING;", -1, &stmt, NULL);
sqlite3_bind_int64(stmt, 1, threshold);
CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
database_iterator_t *iter = database_create_treemap_iterator(db, threshold);
database_treemap_iter_foreach(row, iter) {
sqlite3_bind_int64(db->treemap_merge_up_update_stmt, 1, row.size);
sqlite3_bind_text(db->treemap_merge_up_update_stmt, 2, row.parent, -1, SQLITE_STATIC);
CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_update_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_update_stmt));
sqlite3_bind_text(db->treemap_merge_up_delete_stmt, 1, row.path, -1, SQLITE_STATIC);
CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_delete_stmt));
CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_delete_stmt));
merged_rows += 1;
}
} while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE);
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
"INSERT INTO stats_treemap (path, size) SELECT path,size FROM tm;",
NULL, NULL, NULL));
LOG_INFO("database.c", "Done!");
}
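// path_parent() above is not built into SQLite; it is an application-defined
// function registered elsewhere in database.c. A minimal sketch of such a UDF,
// assuming it only strips the last '/'-separated component (the real
// implementation may differ):
//
// static void path_parent(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
//     const char *path = (const char *) sqlite3_value_text(argv[0]);
//     const char *slash = path ? strrchr(path, '/') : NULL;
//     if (slash == NULL || slash == path) {
//         sqlite3_result_null(ctx); // top-level path: no parent
//         return;
//     }
//     sqlite3_result_text(ctx, path, (int) (slash - path), SQLITE_TRANSIENT);
// }
//
// ... registered with something like:
// sqlite3_create_function(db->db, "path_parent", 1, SQLITE_UTF8, NULL,
//                         path_parent, NULL, NULL);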

View File

@ -0,0 +1,5 @@
#ifndef SIST2_DATABASE_STATS_H
#define SIST2_DATABASE_STATS_H
#endif //SIST2_DATABASE_STATS_H

View File

@ -29,7 +29,7 @@ void destroy_indexer(es_indexer_t *indexer) {
        return;
    }

-    LOG_DEBUG("elastic.c", "Destroying indexer")
+    LOG_DEBUG("elastic.c", "Destroying indexer");

    if (indexer->es_url != NULL) {
        free(indexer->es_url);
@ -64,26 +64,21 @@ void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {
    cJSON_Delete(line);
 }

-void index_json_func(tpool_work_arg_shm_t *arg) {
-    // Copy arg to heap because it's going to be freed immediately after this function returns
-    es_bulk_line_t *line = malloc(arg->arg_size);
-    memcpy(line, arg->arg, arg->arg_size);
-    elastic_index_line(line);
+void index_json_func(job_t *job) {
+    elastic_index_line(job->bulk_line);
 }

-void delete_document(const char *document_id_str, void *UNUSED(_data)) {
+void delete_document(const char *document_id) {
     es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t));
     bulk_line->type = ES_BULK_LINE_DELETE;
     bulk_line->next = NULL;
-    strcpy(bulk_line->doc_id, document_id_str);
-
-    tpool_work_arg_t arg = {
-        .arg_size = sizeof(es_bulk_line_t),
-        .arg = bulk_line
-    };
-    tpool_add_work(IndexCtx.pool, index_json_func, &arg);
+    strcpy(bulk_line->doc_id, document_id);
+
+    tpool_add_work(IndexCtx.pool, &(job_t) {
+            .type = JOB_BULK_LINE,
+            .bulk_line = bulk_line,
+    });
 }
@ -100,11 +95,10 @@ void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
     bulk_line->next = NULL;
     cJSON_free(json);

-    tpool_work_arg_t arg = {
-        .arg_size = sizeof(es_bulk_line_t) + json_len + 2,
-        .arg = bulk_line
-    };
-    tpool_add_work(IndexCtx.pool, index_json_func, &arg);
+    tpool_add_work(IndexCtx.pool, &(job_t) {
+            .type = JOB_BULK_LINE,
+            .bulk_line = bulk_line,
+    });
 }

 void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {
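// job_t comes from the refactored thread pool (tpool.h, not shown in this hunk).
// Judging from its use here, a plausible shape -- an assumption, not the
// verbatim definition -- is a small tagged union that tpool_add_work() copies
// by value into its queue:
//
// typedef struct {
//     job_type_t type;              // e.g. JOB_PARSE_JOB, JOB_BULK_LINE
//     union {
//         parse_job_t *parse_job;
//         es_bulk_line_t *bulk_line;
//     };
// } job_t;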
@ -278,7 +272,7 @@ void print_error(response_t *r) {
 void _elastic_flush(int max) {
     if (max == 0) {
-        LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue")
+        LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue");
         return;
     }
@ -291,13 +285,13 @@ void _elastic_flush(int max) {
     response_t *r = web_post(bulk_url, buf, IndexCtx.es_insecure_ssl);

     if (r->status_code == 0) {
-        LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url)
+        LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url);
     }

     if (r->status_code == 413) {
         if (max <= 1) {
-            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id)
+            LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id);
             free_response(r);
             free(buf);
             free_queue(1);
@ -318,7 +312,7 @@ void _elastic_flush(int max) {
         free_response(r);
         free(buf);
-        LOG_WARNING("elastic.c", "Got 429 status, will retry after delay")
+        LOG_WARNING("elastic.c", "Got 429 status, will retry after delay");
         usleep(1000000 * 20);
         _elastic_flush(max);
         return;
@ -453,7 +447,7 @@ es_version_t *elastic_get_version(const char *es_url, int insecure) {
     }

     if (cJSON_GetObjectItem(response, "error") != NULL) {
-        LOG_WARNING("elastic.c", "Could not get Elasticsearch version")
+        LOG_WARNING("elastic.c", "Could not get Elasticsearch version");
         print_error(r);
         free_response(r);
         return NULL;
@ -489,7 +483,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
     IndexCtx.es_version = es_version;
     if (es_version == NULL) {
-        LOG_FATAL("elastic.c", "Could not get ES version")
+        LOG_FATAL("elastic.c", "Could not get ES version");
     }

     LOG_INFOF("elastic.c",
@ -497,7 +491,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
               format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), IS_LEGACY_VERSION(es_version));

     if (!IS_SUPPORTED_ES_VERSION(es_version)) {
-        LOG_FATAL("elastic.c", "This elasticsearch version is not supported!")
+        LOG_FATAL("elastic.c", "This elasticsearch version is not supported!");
     }

     char *settings = NULL;
@ -524,7 +518,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
     if (r->status_code != 200) {
         print_error(r);
-        LOG_FATAL("elastic.c", "Could not create index")
+        LOG_FATAL("elastic.c", "Could not create index");
     }

     LOG_INFOF("elastic.c", "Create index <%d>", r->status_code);
@ -545,7 +539,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code); LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) { if (r->status_code != 200) {
print_error(r); print_error(r);
LOG_FATAL("elastic.c", "Could not update user settings") LOG_FATAL("elastic.c", "Could not update user settings");
} }
free_response(r); free_response(r);
@ -560,7 +554,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
LOG_INFOF("elastic.c", "Update ES mappings <%d>", r->status_code); LOG_INFOF("elastic.c", "Update ES mappings <%d>", r->status_code);
if (r->status_code != 200) { if (r->status_code != 200) {
print_error(r); print_error(r);
LOG_FATAL("elastic.c", "Could not update user mappings") LOG_FATAL("elastic.c", "Could not update user mappings");
} }
free_response(r); free_response(r);

View File

@ -46,7 +46,7 @@ void print_json(cJSON *document, const char index_id_str[SIST_INDEX_ID_LEN]);
 void index_json(cJSON *document, const char doc_id[SIST_INDEX_ID_LEN]);
-void delete_document(const char *document_id_str, void* data);
+void delete_document(const char *document_id);
 es_indexer_t *create_indexer(const char *url, const char *index);

View File

@ -65,7 +65,7 @@ void web_post_async_poll(subreq_ctx_t *req) {
     curl_easy_getinfo(req->handle, CURLINFO_RESPONSE_CODE, &req->response->status_code);

     if (req->response->status_code == 0) {
-        LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer)
+        LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer);
     }

     curl_multi_cleanup(req->multi);
@ -104,7 +104,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
     curl_multi_add_handle(req->multi, curl);
     curl_multi_perform(req->multi, &req->running_handles);

-    LOG_DEBUGF("web.c", "async request POST %s", url)
+    LOG_DEBUGF("web.c", "async request POST %s", url);
     return req;
 }
@ -136,7 +136,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
     curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp->status_code);

     if (resp->status_code == 0) {
-        LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
+        LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
     }

     curl_easy_cleanup(curl);
@ -180,7 +180,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
     resp->size = buffer.cur;

     if (resp->status_code == 0) {
-        LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
+        LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
     }

     curl_easy_cleanup(curl);

View File

@ -1,9 +1,7 @@
#include "src/ctx.h" #include "src/ctx.h"
#include "serialize.h" #include "serialize.h"
#include "src/parsing/parse.h"
#include "src/parsing/mime.h" #include "src/parsing/mime.h"
#include <zstd.h>
char *get_meta_key_text(enum metakey meta_key) { char *get_meta_key_text(enum metakey meta_key) {
@ -79,7 +77,7 @@ char *get_meta_key_text(enum metakey meta_key) {
         case MetaChecksum:
             return "checksum";
         default:
-            LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
+            LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key);
     }
 }
@ -175,7 +173,7 @@ char *build_json_string(document_t *doc) {
             break;
         }
         default:
-            LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key))
+            LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key));
     }

     meta_line_t *tmp = meta;
@ -189,394 +187,10 @@ char *build_json_string(document_t *doc) {
     return json_str;
 }
static struct {
FILE *out_file;
size_t buf_out_size;
void *buf_out;
ZSTD_CCtx *cctx;
} WriterCtx = {
.out_file = NULL
};
#define ZSTD_COMPRESSION_LEVEL 10
void initialize_writer_ctx(const char *file_path) {
WriterCtx.out_file = fopen(file_path, "wb");
WriterCtx.buf_out_size = ZSTD_CStreamOutSize();
WriterCtx.buf_out = malloc(WriterCtx.buf_out_size);
WriterCtx.cctx = ZSTD_createCCtx();
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_compressionLevel, ZSTD_COMPRESSION_LEVEL);
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_checksumFlag, FALSE);
LOG_DEBUGF("serialize.c", "Open index file for writing %s", file_path)
}
void zstd_write_string(const char *string, const size_t len) {
ZSTD_inBuffer input = {string, len, 0};
do {
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
ZSTD_compressStream2(WriterCtx.cctx, &output, &input, ZSTD_e_continue);
if (output.pos > 0) {
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
}
} while (input.pos != input.size);
}
void write_document_func(tpool_work_arg_shm_t *arg) {
const char *json_str = arg->arg;
if (WriterCtx.out_file == NULL) {
char dstfile[PATH_MAX];
snprintf(dstfile, PATH_MAX, "%s_index_main.ndjson.zst", ScanCtx.index.path);
initialize_writer_ctx(dstfile);
}
zstd_write_string(json_str, arg->arg_size);
}
void zstd_close() {
if (WriterCtx.out_file == NULL) {
LOG_DEBUG("serialize.c", "No zstd stream to close, skipping cleanup")
return;
}
size_t remaining;
do {
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
remaining = ZSTD_endStream(WriterCtx.cctx, &output);
if (output.pos > 0) {
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
}
} while (remaining != 0);
ZSTD_freeCCtx(WriterCtx.cctx);
free(WriterCtx.buf_out);
fclose(WriterCtx.out_file);
LOG_DEBUG("serialize.c", "End zstd stream & close index file")
}
void writer_cleanup() {
zstd_close();
WriterCtx.out_file = NULL;
}
void write_index_descriptor(char *path, index_descriptor_t *desc) {
cJSON *json = cJSON_CreateObject();
cJSON_AddStringToObject(json, "id", desc->id);
cJSON_AddStringToObject(json, "version", desc->version);
cJSON_AddStringToObject(json, "root", desc->root);
cJSON_AddStringToObject(json, "name", desc->name);
cJSON_AddStringToObject(json, "type", desc->type);
cJSON_AddStringToObject(json, "rewrite_url", desc->rewrite_url);
cJSON_AddNumberToObject(json, "timestamp", (double) desc->timestamp);
int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
if (fd < 0) {
LOG_FATALF("serialize.c", "Could not open index descriptor: %s", strerror(errno));
}
char *str = cJSON_Print(json);
size_t ret = write(fd, str, strlen(str));
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
}
free(str);
close(fd);
cJSON_Delete(json);
}
index_descriptor_t read_index_descriptor(char *path) {
struct stat info;
stat(path, &info);
int fd = open(path, O_RDONLY);
if (fd == -1) {
LOG_FATALF("serialize.c", "Invalid/corrupt index (Could not find descriptor): %s: %s\n", path, strerror(errno))
}
char *buf = malloc(info.st_size + 1);
size_t ret = read(fd, buf, info.st_size);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not read index descriptor: %s", strerror(errno));
}
*(buf + info.st_size) = '\0';
close(fd);
cJSON *json = cJSON_Parse(buf);
index_descriptor_t descriptor;
descriptor.timestamp = (long) cJSON_GetObjectItem(json, "timestamp")->valuedouble;
strcpy(descriptor.root, cJSON_GetObjectItem(json, "root")->valuestring);
strcpy(descriptor.name, cJSON_GetObjectItem(json, "name")->valuestring);
strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
descriptor.root_len = (short) strlen(descriptor.root);
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
strcpy(descriptor.id, cJSON_GetObjectItem(json, "id")->valuestring);
if (cJSON_GetObjectItem(json, "type") == NULL) {
strcpy(descriptor.type, INDEX_TYPE_NDJSON);
} else {
strcpy(descriptor.type, cJSON_GetObjectItem(json, "type")->valuestring);
}
cJSON_Delete(json);
free(buf);
return descriptor;
}
 void write_document(document_t *doc) {
     char *json_str = build_json_string(doc);
+    database_write_document(ProcData.index_db, doc, json_str);
     free(doc);
-    const size_t json_str_len = strlen(json_str);
-
-    json_str = realloc(json_str, json_str_len + 1);
-    *(json_str + json_str_len) = '\n';
-
-    tpool_work_arg_t arg = {
-        .arg_size = json_str_len + 1,
-        .arg = json_str
-    };
-    tpool_add_work(ScanCtx.writer_pool, write_document_func, &arg);
+    free(json_str);
 }
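// database_write_document() is part of the new sqlite backend in
// src/database/database.c. A minimal sketch of the write under the sqlite3 API,
// assuming a prepared INSERT on the document table (statement handle and
// column layout are illustrative, not the actual schema):
//
// sqlite3_bind_text(db->write_document_stmt, 1, doc->doc_id, -1, SQLITE_STATIC);
// sqlite3_bind_int64(db->write_document_stmt, 2, (sqlite3_int64) doc->size);
// sqlite3_bind_text(db->write_document_stmt, 3, json_str, -1, SQLITE_STATIC);
// CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_stmt));
// CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_stmt));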
void thread_cleanup() {
cleanup_parse();
cleanup_font();
}
void read_index_bin_handle_line(const char *line, const char *index_id, index_func func) {
cJSON *document = cJSON_Parse(line);
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
cJSON_AddStringToObject(document, "index", index_id);
// Load meta from sidecar files
cJSON *meta_obj = NULL;
if (IndexCtx.meta != NULL) {
const char *meta_string = g_hash_table_lookup(IndexCtx.meta, path_md5_str);
if (meta_string != NULL) {
meta_obj = cJSON_Parse(meta_string);
cJSON *child;
for (child = meta_obj->child; child != NULL; child = child->next) {
char meta_key[4096];
strcpy(meta_key, child->string);
cJSON_DeleteItemFromObject(document, meta_key);
cJSON_AddItemReferenceToObject(document, meta_key, child);
}
}
}
// Load tags from tags DB
if (IndexCtx.tags != NULL) {
const char *tags_string = g_hash_table_lookup(IndexCtx.tags, path_md5_str);
if (tags_string != NULL) {
cJSON *tags_arr = cJSON_Parse(tags_string);
cJSON_DeleteItemFromObject(document, "tag");
cJSON_AddItemToObject(document, "tag", tags_arr);
}
}
func(document, path_md5_str);
cJSON_DeleteItemFromObject(document, "_id");
cJSON_Delete(document);
if (meta_obj) {
cJSON_Delete(meta_obj);
}
}
void read_lines(const char *path, const line_processor_t processor) {
dyn_buffer_t buf = dyn_buffer_create();
// Initialize zstd things
FILE *file = fopen(path, "rb");
size_t const buf_in_size = ZSTD_DStreamInSize();
void *const buf_in = malloc(buf_in_size);
size_t const buf_out_size = ZSTD_DStreamOutSize();
void *const buf_out = malloc(buf_out_size);
ZSTD_DCtx *const dctx = ZSTD_createDCtx();
size_t read;
size_t last_ret = 0;
while ((read = fread(buf_in, 1, buf_in_size, file))) {
ZSTD_inBuffer input = {buf_in, read, 0};
while (input.pos < input.size) {
ZSTD_outBuffer output = {buf_out, buf_out_size, 0};
size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
for (int i = 0; i < output.pos; i++) {
char c = ((char *) output.dst)[i];
if (c == '\n') {
dyn_buffer_write_char(&buf, '\0');
processor.func(buf.buf, processor.data);
buf.cur = 0;
} else {
dyn_buffer_write_char(&buf, c);
}
}
last_ret = ret;
}
}
if (last_ret != 0) {
/* The last return value from ZSTD_decompressStream did not end on a
* frame, but we reached the end of the file! We assume this is an
* error, and the input was truncated.
*/
LOG_FATALF("serialize.c", "EOF before end of stream: %zu", last_ret)
}
ZSTD_freeDCtx(dctx);
free(buf_in);
free(buf_out);
dyn_buffer_destroy(&buf);
fclose(file);
}
void read_index_ndjson(const char *line, void *_data) {
void **data = _data;
const char *index_id = data[0];
index_func func = data[1];
read_index_bin_handle_line(line, index_id, func);
}
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
read_lines(path, (line_processor_t) {
.data = (void *[2]) {(void *) index_id, func},
.func = read_index_ndjson,
});
}
}
static __thread GHashTable *IncrementalReadTable = NULL;
void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;
incremental_put(IncrementalReadTable, path_md5_str, mtime);
}
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
IncrementalReadTable = table;
read_index(filepath, desc->id, desc->type, json_put_incremental);
}
static __thread GHashTable *IncrementalCopyTable = NULL;
static __thread GHashTable *IncrementalNewTable = NULL;
static __thread store_t *IncrementalCopySourceStore = NULL;
static __thread store_t *IncrementalCopyDestinationStore = NULL;
void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
// Copy index line
cJSON_DeleteItemFromObject(document, "index");
char *json_str = cJSON_PrintUnformatted(document);
const size_t json_str_len = strlen(json_str);
json_str = realloc(json_str, json_str_len + 1);
*(json_str + json_str_len) = '\n';
// Copy tn store contents
size_t buf_len;
char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len);
if (buf_len != 0) {
store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len);
free(buf);
}
// Also copy additional thumbnails
if (cJSON_GetObjectItem(document, "thumbnail") != NULL) {
const int thumbnail_count = cJSON_GetObjectItem(document, "thumbnail")->valueint;
for (int i = 1; i < thumbnail_count; i++) {
char tn_key[SIST_DOC_ID_LEN + sizeof(char) * 4];
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc_id, i);
buf = store_read(IncrementalCopySourceStore, tn_key, sizeof(tn_key), &buf_len);
if (buf_len != 0) {
store_write(IncrementalCopyDestinationStore, tn_key, sizeof(tn_key), buf, buf_len);
free(buf);
}
}
}
zstd_write_string(json_str, json_str_len + 1);
free(json_str);
}
}
/**
* Copy items from an index that are in the copy_table. Also copies from
* the store.
*/
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
const char *dst_filepath, GHashTable *copy_table) {
if (WriterCtx.out_file == NULL) {
initialize_writer_ctx(dst_filepath);
}
IncrementalCopyTable = copy_table;
IncrementalCopySourceStore = store;
IncrementalCopyDestinationStore = dst_store;
read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
}
void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
char doc_id_n[SIST_DOC_ID_LEN + 1];
doc_id_n[SIST_DOC_ID_LEN] = '\0';
doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
// do not delete archive virtual entries
if (cJSON_GetObjectItem(document, "parent") == NULL
&& !incremental_get(IncrementalCopyTable, doc_id)
&& !incremental_get(IncrementalNewTable, doc_id)
) {
memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
zstd_write_string(doc_id, sizeof(doc_id_n));
}
}
void incremental_delete(const char *del_filepath, const char *index_filepath,
GHashTable *copy_table, GHashTable *new_table) {
if (WriterCtx.out_file == NULL) {
initialize_writer_ctx(del_filepath);
}
IncrementalCopyTable = copy_table;
IncrementalNewTable = new_table;
read_index(index_filepath, "", INDEX_TYPE_NDJSON, incremental_delete_handle_doc);
}

View File

@ -2,55 +2,7 @@
#define SIST2_SERIALIZE_H

#include "src/sist.h"
#include "store.h"
#include <sys/syscall.h>
#include <glib.h>
typedef struct line_processor {
void* data;
void (*func)(const char*, void*);
} line_processor_t;
typedef void(*index_func)(cJSON *, const char[SIST_DOC_ID_LEN]);
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
const char *dst_filepath, GHashTable *copy_table);
void incremental_delete(const char *del_filepath, const char* index_filepath,
GHashTable *copy_table, GHashTable *new_table);
void write_document(document_t *doc);
void read_lines(const char *path, const line_processor_t processor);
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func);
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc);
/**
* Must be called after write_document
*/
void thread_cleanup();
void writer_cleanup();
void write_index_descriptor(char *path, index_descriptor_t *desc);
index_descriptor_t read_index_descriptor(char *path);
// caller ensures char file_path[PATH_MAX]
#define READ_INDICES(file_path, index_path, action_ok, action_main_fail, cond_original) \
snprintf(file_path, PATH_MAX, "%s_index_main.ndjson.zst", index_path); \
if (access(file_path, R_OK) == 0) { \
action_ok; \
} else { \
action_main_fail; \
} \
snprintf(file_path, PATH_MAX, "%s_index_original.ndjson.zst", index_path); \
if ((cond_original) && access(file_path, R_OK) == 0) { \
action_ok; \
} \
#endif

View File

@ -1,232 +0,0 @@
#include <sys/mman.h>
#include "store.h"
#include "src/ctx.h"
//#define SIST_FAKE_STORE 1
void open_env(const char *path, MDB_env **env, MDB_dbi *dbi) {
mdb_env_create(env);
int open_ret = mdb_env_open(*env,
path,
MDB_WRITEMAP | MDB_MAPASYNC,
S_IRUSR | S_IWUSR
);
if (open_ret != 0) {
LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
}
MDB_txn *txn;
mdb_txn_begin(*env, NULL, 0, &txn);
mdb_dbi_open(txn, NULL, 0, dbi);
mdb_txn_commit(txn);
}
store_t *store_create(const char *path, size_t chunk_size) {
store_t *store = calloc(1, sizeof(struct store_t));
mkdir(path, S_IWUSR | S_IRUSR | S_IXUSR);
strcpy(store->path, path);
MDB_env *env;
MDB_dbi dbi;
#if (SIST_FAKE_STORE != 1)
store->chunk_size = chunk_size;
store->shm = mmap(NULL, sizeof(*store->shm), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
open_env(path, &env, &dbi);
store->shm->size = (size_t) store->chunk_size;
mdb_env_set_mapsize(env, store->shm->size);
// Close, child processes will open the environment again
mdb_env_close(env);
#endif
return store;
}
void store_destroy(store_t *store) {
LOG_DEBUG("store.c", "store_destroy()")
#if (SIST_FAKE_STORE != 1)
munmap(store->shm, sizeof(*store->shm));
mdb_dbi_close(store->proc.env, store->proc.dbi);
mdb_env_close(store->proc.env);
#endif
free(store);
}
void store_flush(store_t *store) {
mdb_env_sync(store->proc.env, TRUE);
}
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
ScanCtx.stat_tn_size += buf_len;
if (LogCtx.very_verbose) {
LOG_DEBUGF("store.c", "Store write %s@{%s} %lu bytes", store->path, key, buf_len)
}
#if (SIST_FAKE_STORE != 1)
if (store->proc.env == NULL) {
open_env(store->path, &store->proc.env, &store->proc.dbi);
LOG_DEBUGF("store.c", "Opening mdb environment %s", store->path)
}
MDB_val mdb_key;
mdb_key.mv_data = key;
mdb_key.mv_size = key_len;
MDB_val mdb_value;
mdb_value.mv_data = buf;
mdb_value.mv_size = buf_len;
MDB_txn *txn;
int db_full = FALSE;
int put_ret = 0;
int should_abort_transaction = FALSE;
int should_increase_size = TRUE;
int begin_ret = mdb_txn_begin(store->proc.env, NULL, 0, &txn);
if (begin_ret == MDB_MAP_RESIZED) {
// mapsize was increased by another process. We don't need to increase the size again, but we need
// to update the size of the environment for the current process.
db_full = TRUE;
should_increase_size = FALSE;
} else {
put_ret = mdb_put(txn, store->proc.dbi, &mdb_key, &mdb_value, 0);
if (put_ret == MDB_MAP_FULL) {
// Database is full, we need to increase the environment size
db_full = TRUE;
should_abort_transaction = TRUE;
} else {
int commit_ret = mdb_txn_commit(txn);
if (commit_ret == MDB_MAP_FULL) {
db_full = TRUE;
}
}
}
if (db_full) {
LOG_DEBUGF("store.c", "Updating mdb mapsize to %lu bytes", store->shm->size)
if (should_abort_transaction) {
mdb_txn_abort(txn);
}
// Cannot resize when there is an opened transaction in this process.
// Resize take effect on the next commit.
if (should_increase_size) {
store->shm->size += store->chunk_size;
}
int resize_ret = mdb_env_set_mapsize(store->proc.env, store->shm->size);
if (resize_ret != 0) {
LOG_ERRORF("store.c", "mdb_env_set_mapsize() failed: %s", mdb_strerror(resize_ret))
}
mdb_txn_begin(store->proc.env, NULL, 0, &txn);
int put_ret_retry = mdb_put(txn, store->proc.dbi, &mdb_key, &mdb_value, 0);
if (put_ret_retry != 0) {
LOG_ERRORF("store.c", "mdb_put() (retry) failed: %s", mdb_strerror(put_ret_retry))
}
int ret = mdb_txn_commit(txn);
if (ret != 0) {
LOG_FATALF("store.c", "FIXME: Could not commit to store %s: %s (%d), %d, %d %d",
store->path, mdb_strerror(ret), ret,
ret, put_ret_retry)
}
LOG_DEBUGF("store.c", "Updated mdb mapsize to %lu bytes", store->shm->size)
} else if (put_ret != 0) {
LOG_ERRORF("store.c", "mdb_put() failed: %s", mdb_strerror(put_ret))
}
#endif
}
char *store_read(store_t *store, char *key, size_t key_len, size_t *return_value_len) {
char *buf = NULL;
#if (SIST_FAKE_STORE != 1)
if (store->proc.env == NULL) {
open_env(store->path, &store->proc.env, &store->proc.dbi);
}
MDB_val mdb_key;
mdb_key.mv_data = key;
mdb_key.mv_size = key_len;
MDB_val mdb_value;
MDB_txn *txn;
mdb_txn_begin(store->proc.env, NULL, MDB_RDONLY, &txn);
int get_ret = mdb_get(txn, store->proc.dbi, &mdb_key, &mdb_value);
if (get_ret == MDB_NOTFOUND) {
*return_value_len = 0;
} else {
*return_value_len = mdb_value.mv_size;
buf = malloc(mdb_value.mv_size);
memcpy(buf, mdb_value.mv_data, mdb_value.mv_size);
}
mdb_txn_abort(txn);
#endif
return buf;
}
GHashTable *store_read_all(store_t *store) {
if (store->proc.env == NULL) {
open_env(store->path, &store->proc.env, &store->proc.dbi);
LOG_DEBUGF("store.c", "Opening mdb environment %s", store->path)
}
int count = 0;
GHashTable *table = g_hash_table_new_full(g_str_hash, g_str_equal, free, free);
MDB_txn *txn = NULL;
mdb_txn_begin(store->proc.env, NULL, MDB_RDONLY, &txn);
MDB_cursor *cur = NULL;
mdb_cursor_open(txn, store->proc.dbi, &cur);
MDB_val key;
MDB_val value;
while (mdb_cursor_get(cur, &key, &value, MDB_NEXT) == 0) {
char *key_str = malloc(key.mv_size);
memcpy(key_str, key.mv_data, key.mv_size);
char *val_str = malloc(value.mv_size);
memcpy(val_str, value.mv_data, value.mv_size);
g_hash_table_insert(table, key_str, val_str);
count += 1;
}
const char *path;
mdb_env_get_path(store->proc.env, &path);
LOG_DEBUGF("store.c", "Read %d entries from %s", count, path)
mdb_cursor_close(cur);
mdb_txn_abort(txn);
return table;
}
void store_copy(store_t *store, const char *destination) {
mkdir(destination, S_IWUSR | S_IRUSR | S_IXUSR);
mdb_env_copy(store->proc.env, destination);
}

View File

@ -1,42 +0,0 @@
#ifndef SIST2_STORE_H
#define SIST2_STORE_H
#include <pthread.h>
#include <lmdb.h>
#include <glib.h>
#define STORE_SIZE_TN (1024 * 1024 * 5)
#define STORE_SIZE_TAG (1024 * 1024)
#define STORE_SIZE_META STORE_SIZE_TAG
typedef struct store_t {
char path[PATH_MAX];
size_t chunk_size;
struct {
MDB_dbi dbi;
MDB_env *env;
} proc;
struct {
size_t size;
} *shm;
} store_t;
store_t *store_create(const char *path, size_t chunk_size);
void store_destroy(store_t *store);
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len);
void store_flush(store_t *store);
char *store_read(store_t *store, char *key, size_t key_len, size_t *return_value_len);
GHashTable *store_read_all(store_t *store);
void store_copy(store_t *store, const char *destination);
#endif

View File

@ -1,46 +1,12 @@
#include "walk.h" #include "walk.h"
#include "src/ctx.h" #include "src/ctx.h"
#include "src/parsing/parse.h" #include "src/parsing/fs_util.h"
#include <ftw.h> #include <ftw.h>
#include <pthread.h>
#define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0) #define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
parse_job_t *job = malloc(sizeof(parse_job_t));
strcpy(job->filepath, filepath);
job->base = base;
char *p = strrchr(filepath + base, '.');
if (p != NULL) {
job->ext = (int) (p - filepath + 1);
} else {
job->ext = len;
}
job->vfile.st_size = info->st_size;
job->vfile.st_mode = info->st_mode;
job->vfile.mtime = (int) info->st_mtim.tv_sec;
job->parent[0] = '\0';
memcpy(job->vfile.filepath, job->filepath, sizeof(job->vfile.filepath));
job->vfile.read = fs_read;
// Filesystem reads are always rewindable
job->vfile.read_rewindable = fs_read;
job->vfile.reset = fs_reset;
job->vfile.close = fs_close;
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
job->vfile.has_checksum = FALSE;
job->vfile.rewind_buffer_size = 0;
job->vfile.rewind_buffer = NULL;
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
return job;
}
 int sub_strings[30];
 #define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
@ -55,7 +21,7 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
     }

     if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
-        LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
+        LOG_DEBUGF("walk.c", "Excluded: %s", filepath);

         if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
             pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
@ -69,13 +35,13 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
     }

     if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
-        parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
+        parse_job_t *job = create_parse_job(filepath, (int) info->st_mtim.tv_sec, info->st_size);

-        tpool_work_arg_t arg = {
-            .arg_size = sizeof(parse_job_t),
-            .arg = job
-        };
-        tpool_add_work(ScanCtx.pool, parse, &arg);
+        tpool_add_work(ScanCtx.pool, &(job_t) {
+                .type = JOB_PARSE_JOB,
+                .parse_job = job
+        });
+        free(job);
     }

     return FTW_CONTINUE;
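// Ownership note: the old API handed the heap-allocated arg off to the pool;
// the new tpool_add_work() appears to copy the job_t (and the parse job it
// points to) into the queue, which is why the caller can free(job) right after
// enqueueing. The &(job_t){...} compound literal only has to outlive the call.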
@ -116,7 +82,7 @@ int iterate_file_list(void *input_file) {
     }

     if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
-        LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
+        LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path);

         if (S_ISREG(info.st_mode)) {
             pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
@ -131,16 +97,14 @@ int iterate_file_list(void *input_file) {
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf); LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
} }
int base = (int) (strrchr(buf, '/') - buf) + 1; parse_job_t *job = create_parse_job(absolute_path, (int) info.st_mtim.tv_sec, info.st_size);
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
free(absolute_path); free(absolute_path);
tpool_work_arg_t arg = { tpool_add_work(ScanCtx.pool, &(job_t) {
.arg = job, .type = JOB_PARSE_JOB,
.arg_size = sizeof(parse_job_t) .parse_job = job
}; });
tpool_add_work(ScanCtx.pool, parse, &arg); free(job);
} }
return 0; return 0;

View File

@ -21,8 +21,6 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
     char log_str[LOG_MAX_LENGTH];

-    unsigned long long pid = (unsigned long long) pthread_self();
-
     char datetime[32];
     time_t t;
     struct tm result;
@ -42,8 +40,8 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
-                pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
+                "{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
+                ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
         );

         cJSON_Delete(filepath_json);
@ -58,15 +56,15 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
     if (is_tty) {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "\033[%dm[%04llX]%s [%s] [%s %s] ",
-                31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
+                "\033[%dmT%d%s [%s] [%s %s] ",
+                31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
                 datetime, log_levels[level], filepath
         );
     } else {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "[%04llX] [%s] [%s %s] ",
-                pid, datetime, log_levels[level], filepath
+                "T%d [%s] [%s %s] ",
+                ProcData.thread_id, datetime, log_levels[level], filepath
         );
     }
@ -112,8 +110,6 @@ void sist_log(const char *filepath, int level, char *str) {
     char log_str[LOG_MAX_LENGTH];

-    unsigned long long pid = (unsigned long long) pthread_self();
-
     char datetime[32];
     time_t t;
     struct tm result;
@ -132,8 +128,8 @@ void sist_log(const char *filepath, int level, char *str) {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
-                pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
+                "{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
+                ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
         );

         cJSON_Delete(log_str_json);
@ -147,16 +143,16 @@ void sist_log(const char *filepath, int level, char *str) {
     if (is_tty) {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "\033[%dm[%04llX]%s [%s] [%s %s] %s \033[0m\n",
-                31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
+                "\033[%dmT%d%s [%s] [%s %s] %s \033[0m\n",
+                31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
                 datetime, log_levels[level], filepath,
                 str
         );
     } else {
         log_len = snprintf(
                 log_str, sizeof(log_str),
-                "[%04llX] [%s] [%s %s] %s \n",
-                pid, datetime, log_levels[level], filepath,
+                "T%d [%s] [%s %s] %s \n",
+                ProcData.thread_id, datetime, log_levels[level], filepath,
                 str
         );
     }

View File

@ -2,6 +2,7 @@
 #define SIST2_LOG_H

+#include <signal.h>

 #define LOG_MAX_LENGTH 8192

 #define LOG_SIST_DEBUG 0
@ -10,37 +11,37 @@
 #define LOG_SIST_ERROR 3
 #define LOG_SIST_FATAL 4

-#define LOG_DEBUGF(filepath, fmt, ...) \
-    if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}
+#define LOG_DEBUGF(filepath, fmt, ...) do{\
+    if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}}while(0)
-#define LOG_DEBUG(filepath, str) \
-    if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}
+#define LOG_DEBUG(filepath, str) do{\
+    if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}}while(0)

-#define LOG_INFOF(filepath, fmt, ...) \
-    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}
+#define LOG_INFOF(filepath, fmt, ...) do {\
+    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}} while(0)
-#define LOG_INFO(filepath, str) \
-    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}
+#define LOG_INFO(filepath, str) do {\
+    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}} while(0)

-#define LOG_WARNINGF(filepath, fmt, ...) \
-    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}
+#define LOG_WARNINGF(filepath, fmt, ...) do {\
+    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}}while(0)
-#define LOG_WARNING(filepath, str) \
-    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}
+#define LOG_WARNING(filepath, str) do{\
+    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}}while(0)

-#define LOG_ERRORF(filepath, fmt, ...) \
-    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}
+#define LOG_ERRORF(filepath, fmt, ...) do {\
+    if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}}while(0)
-#define LOG_ERROR(filepath, str) \
-    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}
+#define LOG_ERROR(filepath, str) do{\
+    if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}}while(0)

-#define LOG_FATALF(filepath, fmt, ...) \
+#define LOG_FATALF(filepath, fmt, ...)\
     sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__);\
-    exit(-1);
+    raise(SIGUSR1)

 #define LOG_FATAL(filepath, str) \
     sist_log(filepath, LOG_SIST_FATAL, str);\
-    exit(-1);
+    exit(SIGUSR1)

 #define LOG_FATALF_NO_EXIT(filepath, fmt, ...) \
-    sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__);
+    sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__)

 #define LOG_FATAL_NO_EXIT(filepath, str) \
-    sist_log(filepath, LOG_SIST_FATAL, str);
+    sist_log(filepath, LOG_SIST_FATAL, str)

 #include "sist.h"

View File

@ -5,8 +5,6 @@
 #include <locale.h>

 #include "cli.h"
-#include "io/serialize.h"
-#include "io/store.h"
 #include "tpool.h"
 #include "io/walk.h"
 #include "index/elastic.h"
@ -16,10 +14,9 @@
#include "auth0/auth0_c_api.h" #include "auth0/auth0_c_api.h"
#include <signal.h> #include <signal.h>
#include <unistd.h> #include <pthread.h>
#include <sys/mman.h>
#include "stats.h" #include "src/database/database.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool." #define DESCRIPTION "Lightning-fast file system indexer and search tool."
@ -46,30 +43,31 @@ void sig_handler(int signum) {
LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n"); LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n");
LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum)); LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum));
if (ScanCtx.dbg_current_files != NULL) { // TODO: Print debug info
GHashTableIter iter; // if (ScanCtx.dbg_current_files != NULL) {
g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files); // GHashTableIter iter;
// g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files);
void *key; //
void *value; // void *key;
while (g_hash_table_iter_next(&iter, &key, &value)) { // void *value;
parse_job_t *job = value; // while (g_hash_table_iter_next(&iter, &key, &value)) {
// parse_job_t *job = value;
if (isatty(STDERR_FILENO)) { //
LOG_DEBUGF( // if (isatty(STDERR_FILENO)) {
"*SIGNAL HANDLER*", // LOG_DEBUGF(
"Thread \033[%dm[%04llX]\033[0m was working on job '%s'", // "*SIGNAL HANDLER*",
31 + ((unsigned int) key) % 7, key, job->filepath // "Thread \033[%dm[%04llX]\033[0m was working on job '%s'",
); // 31 + ((unsigned int) key) % 7, key, job->filepath
} else { // );
LOG_DEBUGF( // } else {
"*SIGNAL HANDLER*", // LOG_DEBUGF(
"THREAD [%04llX] was working on job %s", // "*SIGNAL HANDLER*",
key, job->filepath // "THREAD [%04llX] was working on job %s",
); // key, job->filepath
} // );
} // }
} // }
// }
if (ScanCtx.pool != NULL) { if (ScanCtx.pool != NULL) {
tpool_dump_debug_info(ScanCtx.pool); tpool_dump_debug_info(ScanCtx.pool);
@ -82,18 +80,18 @@ void sig_handler(int signum) {
     LOG_INFO(
             "*SIGNAL HANDLER*",
             "Please consider creating a bug report at https://github.com/simon987/sist2/issues !"
-    )
+    );
     LOG_INFO(
             "*SIGNAL HANDLER*",
             "sist2 is an open source project and relies on the collaboration of its users to diagnose and fix bugs"
-    )
+    );

 #ifndef SIST_DEBUG
     LOG_WARNING(
             "*SIGNAL HANDLER*",
             "You are running sist2 in release mode! Please consider downloading the debug binary from the Github "
             "releases page to provide additionnal information when submitting a bug report."
-    )
+    );
 #endif

     if (signum == SIGSEGV && sigsegv_handler != NULL) {
@ -105,36 +103,59 @@ void sig_handler(int signum) {
     exit(-1);
 }

-void init_dir(const char *dirpath, scan_args_t *args) {
-    char path[PATH_MAX];
-    snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
-
-    time(&ScanCtx.index.desc.timestamp);
-    strcpy(ScanCtx.index.desc.version, Version);
-    strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
-
-    if (args->incremental != NULL) {
-        // copy old index id
-        char descriptor_path[PATH_MAX];
-        snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
-        index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
-        memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
-    } else {
-        // generate new index id based on timestamp
-        unsigned char index_md5[MD5_DIGEST_LENGTH];
-        MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
-        buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
-    }
-
-    write_index_descriptor(path, &ScanCtx.index.desc);
+void database_scan_begin(scan_args_t *args) {
+    index_descriptor_t *desc = &ScanCtx.index.desc;
+
+    database_t *db = database_create(args->output, INDEX_DATABASE);
+
+    if (args->incremental) {
+        // Update existing descriptor
+        database_open(db);
+        index_descriptor_t *original_desc = database_read_index_descriptor(db);
+
+        // copy original index id
+        strcpy(desc->id, original_desc->id);
+
+        if (original_desc->version_major != VersionMajor) {
+            LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc->version, Version);
+        }
+
+        strcpy(original_desc->root, desc->root);
+        original_desc->root_len = desc->root_len;
+        strcpy(original_desc->rewrite_url, desc->rewrite_url);
+        strcpy(original_desc->name, desc->name);
+        time(&original_desc->timestamp);
+        database_write_index_descriptor(db, original_desc);
+        free(original_desc);
+
+        database_incremental_scan_begin(db);
+    } else {
+        // Create new descriptor
+        time(&desc->timestamp);
+        strcpy(desc->version, Version);
+        desc->version_major = VersionMajor;
+        desc->version_minor = VersionMinor;
+        desc->version_patch = VersionPatch;
+
+        // generate new index id based on timestamp
+        unsigned char index_md5[MD5_DIGEST_LENGTH];
+        MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
+        buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
+
+        database_initialize(db);
+        database_open(db);
+        database_write_index_descriptor(db, desc);
+    }
+
+    database_close(db, FALSE);
 }
-void scan_print_header() {
-    LOG_INFOF("main.c", "sist2 v%s", Version)
-}
-
-void _store(char *key, size_t key_len, char *buf, size_t buf_len) {
-    store_write(ScanCtx.index.store, key, key_len, buf, buf_len);
+void write_thumbnail_callback(char *key, int num, void *buf, size_t buf_len) {
+    database_write_thumbnail(ProcData.index_db, key, num, buf, buf_len);
 }

 void _log(const char *filepath, int level, char *str) {
@ -177,11 +198,8 @@ void _logf(const char *filepath, int level, char *format, ...) {
 }

 void initialize_scan_context(scan_args_t *args) {
-    // TODO: shared
-    ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
-    pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
     pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
-    pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);

     ScanCtx.calculate_checksums = args->calculate_checksums;
@ -189,7 +207,7 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.arc_ctx.mode = args->archive_mode;
     ScanCtx.arc_ctx.log = _log;
     ScanCtx.arc_ctx.logf = _logf;
-    ScanCtx.arc_ctx.parse = (parse_callback_t) parse_job;
+    ScanCtx.arc_ctx.parse = (parse_callback_t) parse;
     if (args->archive_passphrase != NULL) {
         strcpy(ScanCtx.arc_ctx.passphrase, args->archive_passphrase);
     } else {
@ -199,12 +217,12 @@ void initialize_scan_context(scan_args_t *args) {
     // Comic
     ScanCtx.comic_ctx.log = _log;
     ScanCtx.comic_ctx.logf = _logf;
-    ScanCtx.comic_ctx.store = _store;
+    ScanCtx.comic_ctx.store = write_thumbnail_callback;
     ScanCtx.comic_ctx.enable_tn = args->tn_count > 0;
     ScanCtx.comic_ctx.tn_size = args->tn_size;
     ScanCtx.comic_ctx.tn_qscale = args->tn_quality;
-    ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
-    ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbz");
+    ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string("application/x-cbr");
+    ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string("application/x-cbz");

     // Ebook
     ScanCtx.ebook_ctx.content_size = args->content_size;
@ -216,7 +234,7 @@ void initialize_scan_context(scan_args_t *args) {
     }

     ScanCtx.ebook_ctx.log = _log;
     ScanCtx.ebook_ctx.logf = _logf;
-    ScanCtx.ebook_ctx.store = _store;
+    ScanCtx.ebook_ctx.store = write_thumbnail_callback;
     ScanCtx.ebook_ctx.fast_epub_parse = args->fast_epub;
     ScanCtx.ebook_ctx.tn_qscale = args->tn_quality;
@ -224,7 +242,7 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.font_ctx.enable_tn = args->tn_count > 0;
     ScanCtx.font_ctx.log = _log;
     ScanCtx.font_ctx.logf = _logf;
-    ScanCtx.font_ctx.store = _store;
+    ScanCtx.font_ctx.store = write_thumbnail_callback;

     // Media
     ScanCtx.media_ctx.tn_qscale = args->tn_quality;
@ -232,7 +250,7 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.media_ctx.tn_count = args->tn_count;
     ScanCtx.media_ctx.log = _log;
     ScanCtx.media_ctx.logf = _logf;
-    ScanCtx.media_ctx.store = _store;
+    ScanCtx.media_ctx.store = write_thumbnail_callback;
     ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer_mib * 1024 * 1024;
     ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
     ScanCtx.media_ctx.read_subtitles = args->tn_count;
@ -248,7 +266,7 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.ooxml_ctx.content_size = args->content_size;
     ScanCtx.ooxml_ctx.log = _log;
     ScanCtx.ooxml_ctx.logf = _logf;
-    ScanCtx.ooxml_ctx.store = _store;
+    ScanCtx.ooxml_ctx.store = write_thumbnail_callback;

     // MOBI
     ScanCtx.mobi_ctx.content_size = args->content_size;
@ -264,8 +282,8 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.msdoc_ctx.content_size = args->content_size;
     ScanCtx.msdoc_ctx.log = _log;
     ScanCtx.msdoc_ctx.logf = _logf;
-    ScanCtx.msdoc_ctx.store = _store;
-    ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/msword");
+    ScanCtx.msdoc_ctx.store = write_thumbnail_callback;
+    ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string("application/msword");

     ScanCtx.threads = args->threads;
     ScanCtx.depth = args->depth;
@ -283,174 +301,67 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.raw_ctx.tn_size = args->tn_size;
     ScanCtx.raw_ctx.log = _log;
     ScanCtx.raw_ctx.logf = _logf;
-    ScanCtx.raw_ctx.store = _store;
+    ScanCtx.raw_ctx.store = write_thumbnail_callback;

     // Wpd
     ScanCtx.wpd_ctx.content_size = args->content_size;
     ScanCtx.wpd_ctx.log = _log;
     ScanCtx.wpd_ctx.logf = _logf;
-    ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
+    ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string("application/wordperfect");

     // Json
     ScanCtx.json_ctx.content_size = args->content_size;
     ScanCtx.json_ctx.log = _log;
     ScanCtx.json_ctx.logf = _logf;
-    ScanCtx.json_ctx.json_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/json");
-    ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/ndjson");
+    ScanCtx.json_ctx.json_mime = mime_get_mime_by_string("application/json");
+    ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string("application/ndjson");
 }
/**
* Loads an existing index as the baseline for incremental scanning.
* 1. load old index files (original+main) => original_table
* 2. allocate empty table => copy_table
* 3. allocate empty table => new_table
* the original_table/copy_table/new_table will be populated in parsing/parse.c:parse
* and consumed in main.c:save_incremental_index
*
* Note: the existing index may or may not be of incremental index form.
*/
void load_incremental_index(const scan_args_t *args) {
char file_path[PATH_MAX];
ScanCtx.original_table = incremental_get_table();
ScanCtx.copy_table = incremental_get_table();
ScanCtx.new_table = incremental_get_table();
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
if (strcmp(original_desc.version, Version) != 0) {
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc.version, Version)
}
READ_INDICES(
file_path,
args->incremental,
incremental_read(ScanCtx.original_table, file_path, &original_desc),
LOG_DEBUG("main.c", "The base index for incremental scan does not have a main index"),
TRUE
);
LOG_INFOF("main.c", "Loaded %d items in to mtime table.", g_hash_table_size(ScanCtx.original_table))
}
/**
* Saves an incremental index.
* Before calling this function, the scanner should have finished writing the main index.
* 1. Build original_table - new_table => delete_table
* 2. Incrementally copy from old index files [(original+main) /\ copy_table] => index_original.ndjson.zst & store
*/
void save_incremental_index(scan_args_t *args) {
char dst_path[PATH_MAX];
char store_path[PATH_MAX];
char file_path[PATH_MAX];
char del_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental);
snprintf(dst_path, PATH_MAX, "%s_index_original.ndjson.zst", ScanCtx.index.path);
store_t *source = store_create(store_path, STORE_SIZE_TN);
LOG_INFOF("main.c", "incremental_delete: original size = %u, copy size = %u, new size = %u",
g_hash_table_size(ScanCtx.original_table),
g_hash_table_size(ScanCtx.copy_table),
g_hash_table_size(ScanCtx.new_table));
snprintf(del_path, PATH_MAX, "%s_index_delete.list.zst", ScanCtx.index.path);
READ_INDICES(file_path, args->incremental,
incremental_delete(del_path, file_path, ScanCtx.copy_table, ScanCtx.new_table),
perror("incremental_delete"), 1);
writer_cleanup();
READ_INDICES(file_path, args->incremental,
incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table),
perror("incremental_copy"), 1);
writer_cleanup();
store_destroy(source);
snprintf(store_path, PATH_MAX, "%stags", args->incremental);
snprintf(dst_path, PATH_MAX, "%stags", ScanCtx.index.path);
store_t *source_tags = store_create(store_path, STORE_SIZE_TAG);
store_copy(source_tags, dst_path);
store_destroy(source_tags);
}
/**
* An index can be either incremental or non-incremental (initial index).
* For an initial index, there is only the "main" index.
* For an incremental index, there are, additionally:
* - An "original" index, referencing all files unchanged since the previous index.
* - A "delete" index, referencing all files that exist in the previous index, but deleted since then.
* Therefore, for an incremental index, "main"+"original" covers all the current files in the live filesystem,
* and is orthognal with the "delete" index. When building an incremental index upon an old incremental index,
* the old "delete" index can be safely ignored.
*/
 void sist2_scan(scan_args_t *args) {
-    ScanCtx.mime_table = mime_get_mime_table();
-    ScanCtx.ext_table = mime_get_ext_table();
-
     initialize_scan_context(args);

-    init_dir(ScanCtx.index.path, args);
+    database_scan_begin(args);

-    char store_path[PATH_MAX];
-    snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
-    ScanCtx.index.store = store_create(store_path, STORE_SIZE_TN);
-
-    snprintf(store_path, PATH_MAX, "%smeta", ScanCtx.index.path);
-    ScanCtx.index.meta_store = store_create(store_path, STORE_SIZE_META);
-
-    scan_print_header();
-
-    if (args->incremental != NULL) {
-        load_incremental_index(args);
-    }
-
-    ScanCtx.writer_pool = tpool_create(1, writer_cleanup, FALSE);
-    tpool_start(ScanCtx.writer_pool);
-    ScanCtx.pool = tpool_create(ScanCtx.threads, thread_cleanup, TRUE);
+    LOG_INFOF("main.c", "sist2 v%s", Version);
+
+    ScanCtx.pool = tpool_create(ScanCtx.threads, TRUE);
     tpool_start(ScanCtx.pool);

     if (args->list_path) {
         // Scan using file list
         int list_ret = iterate_file_list(args->list_file);
         if (list_ret != 0) {
-            LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
+            LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret);
         }
     } else {
         // Scan directory recursively
         int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
         if (walk_ret == -1) {
-            LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
+            LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno);
         }
     }

     tpool_wait(ScanCtx.pool);
     tpool_destroy(ScanCtx.pool);

-    tpool_wait(ScanCtx.writer_pool);
-    tpool_destroy(ScanCtx.writer_pool);
-
-    LOG_DEBUGF("main.c", "Skipped files: %d", ScanCtx.dbg_skipped_files_count)
-    LOG_DEBUGF("main.c", "Excluded files: %d", ScanCtx.dbg_excluded_files_count)
-    LOG_DEBUGF("main.c", "Failed files: %d", ScanCtx.dbg_failed_files_count)
-    LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size)
-    LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size)
+    LOG_DEBUGF("main.c", "Skipped files: %d", ScanCtx.dbg_skipped_files_count);
+    LOG_DEBUGF("main.c", "Excluded files: %d", ScanCtx.dbg_excluded_files_count);
+    LOG_DEBUGF("main.c", "Failed files: %d", ScanCtx.dbg_failed_files_count);
+    LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size);
+    LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size);
+
+    database_t *db = database_create(args->output, INDEX_DATABASE);
+    database_open(db);

-    if (args->incremental != NULL) {
-        save_incremental_index(args);
+    if (args->incremental != FALSE) {
+        database_incremental_scan_end(db);
     }

-    generate_stats(&ScanCtx.index, args->treemap_threshold, ScanCtx.index.path);
-
-    store_destroy(ScanCtx.index.store);
-    store_destroy(ScanCtx.index.meta_store);
+    database_generate_stats(db, args->treemap_threshold);
+    database_close(db, TRUE);
 }
void sist2_index(index_args_t *args) {
    IndexCtx.es_url = args->es_url;
    IndexCtx.es_index = args->es_index;
    IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
@ -461,91 +372,69 @@ void sist2_index(index_args_t *args) {
        elastic_init(args->force_reset, args->es_mappings, args->es_settings);
    }

    database_t *db = database_create(args->index_path, INDEX_DATABASE);
    database_open(db);
    index_descriptor_t *desc = database_read_index_descriptor(db);
    database_close(db, FALSE);

    LOG_DEBUGF("main.c", "Index version %s", desc->version);

    if (desc->version_major != VersionMajor) {
        LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", desc->version, Version);
    }

    IndexCtx.pool = tpool_create(args->threads, args->print == FALSE);
    tpool_start(IndexCtx.pool);

    int cnt = 0;

    db = database_create(args->index_path, INDEX_DATABASE);
    database_open(db);
    database_iterator_t *iterator = database_create_document_iterator(db);
    database_document_iter_foreach(json, iterator) {
        const char *doc_id = cJSON_GetObjectItem(json, "_id")->valuestring;
        if (args->print) {
            print_json(json, doc_id);
        } else {
            index_json(json, doc_id);
            cnt += 1;
        }
    }
    free(iterator);
    database_close(db, FALSE);

    // Only read the _delete index if we're sending data to ES
    if (!args->print) {
        // TODO: (delete_list iterator)
    }

    tpool_wait(IndexCtx.pool);
    tpool_destroy(IndexCtx.pool);

    if (IndexCtx.needs_es_connection) {
        finish_indexer(args->script, args->async_script, desc->id);
    }

    free(desc);
}
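// database_document_iter_foreach above reads like a convenience macro over
// the new SQLite-backed iterator. A sketch of the loop it plausibly expands
// to -- database_document_iter() is an assumption, not shown in this diff:
//
//     for (cJSON *json = database_document_iter(iterator);
//             json != NULL;
//             json = database_document_iter(iterator)) {
//         /* loop body */
//     }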
void sist2_exec_script(exec_args_t *args) {
    LogCtx.verbose = TRUE;

    IndexCtx.es_url = args->es_url;
    IndexCtx.es_index = args->es_index;
    IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
    IndexCtx.needs_es_connection = TRUE;

    database_t *db = database_create(args->index_path, INDEX_DATABASE);
    database_open(db);
    index_descriptor_t *desc = database_read_index_descriptor(db);

    LOG_DEBUGF("main.c", "Index version %s", desc->version);

    execute_update_script(args->script, args->async_script, desc->id);
    free(args->script);

    database_close(db, FALSE);
}
void sist2_web(web_args_t *args) {
@ -569,23 +458,17 @@ void sist2_web(web_args_t *args) {
    for (int i = 0; i < args->index_count; i++) {
        char *abs_path = abspath(args->indices[i]);
        strcpy(WebCtx.indices[i].path, abs_path);

        WebCtx.indices[i].db = database_create(abs_path, INDEX_DATABASE);
        database_open(WebCtx.indices[i].db);

        index_descriptor_t *desc = database_read_index_descriptor(WebCtx.indices[i].db);
        WebCtx.indices[i].desc = *desc;
        free(desc);

        LOG_INFOF("main.c", "Loaded index: [%s]", WebCtx.indices[i].desc.name);
        free(abs_path);
    }
@ -600,7 +483,7 @@ void sist2_web(web_args_t *args) {
 * Negative number -> Raise error
 * Specified a valid number -> Continue as normal
 */
int set_to_negative_if_value_is_zero(UNUSED(struct argparse *self), const struct argparse_option *option) {
    int specified_value = *(int *) option->value;

    if (specified_value == 0) {
@ -613,6 +496,7 @@ int set_to_negative_if_value_is_zero(struct argparse *self, const struct argpars
    }
}
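// Sketch of the intent (an assumption based on the comment above): a
// user-supplied 0 is stored as a negative sentinel so later code can tell
// "explicitly disabled" apart from "left at default", e.g.:
//
//     if (scan_args->content_size < 0) {
//         scan_args->content_size = 0;   // user passed 0: disable extraction
//     }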
#include <zlib.h>
int main(int argc, const char *argv[]) {
    // sigsegv_handler = signal(SIGSEGV, sig_handler);
@ -645,8 +529,8 @@ int main(int argc, const char *argv[]) {
            OPT_GROUP("Scan options"),
            OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
            OPT_INTEGER('q', "thumbnail-quality", &scan_args->tn_quality,
                        "Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT=2",
                        set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality),
            OPT_INTEGER(0, "thumbnail-size", &scan_args->tn_size,
                        "Thumbnail size, in pixels. DEFAULT=500",
                        set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_size),
@ -656,7 +540,8 @@ int main(int argc, const char *argv[]) {
            OPT_INTEGER(0, "content-size", &scan_args->content_size,
                        "Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768",
                        set_to_negative_if_value_is_zero, (intptr_t) &scan_args->content_size),
            OPT_BOOLEAN(0, "incremental", &scan_args->incremental,
                        // TODO: Update help string
                        "Reuse an existing index and only scan modified files."),
            OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"),
            OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."),
@ -692,7 +577,8 @@ int main(int argc, const char *argv[]) {
            OPT_GROUP("Index options"),
            OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
            OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
            OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
                        "Do not verify SSL connections to Elasticsearch."),
            OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
            OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
            OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
@ -701,20 +587,22 @@ int main(int argc, const char *argv[]) {
            OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
            OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
            OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
            OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 70"),
            OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. "
                                                                      "(You must use this option the first time you use the index command)"),

            OPT_GROUP("Web options"),
            OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
            OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
                        "Do not verify SSL connections to Elasticsearch."),
            OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
            OPT_STRING(0, "bind", &web_args->listen_address, "Listen on this address. DEFAULT=localhost:4090"),
            OPT_STRING(0, "auth", &web_args->credentials, "Basic auth in user:password format"),
            OPT_STRING(0, "auth0-audience", &web_args->auth0_audience, "API audience/identifier"),
            OPT_STRING(0, "auth0-domain", &web_args->auth0_domain, "Application domain"),
            OPT_STRING(0, "auth0-client-id", &web_args->auth0_client_id, "Application client ID"),
            OPT_STRING(0, "auth0-public-key-file", &web_args->auth0_public_key_path,
                       "Path to Auth0 public key file extracted from <domain>/pem"),
            OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
            OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
            OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
@ -722,7 +610,8 @@ int main(int argc, const char *argv[]) {
            OPT_GROUP("Exec-script options"),
            OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
            OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
                        "Do not verify SSL connections to Elasticsearch."),
            OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
            OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
            OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
@ -800,7 +689,7 @@ int main(int argc, const char *argv[]) {
    } else {
        argparse_usage(&argparse);
        LOG_FATALF("main.c", "Invalid command: '%s'\n", argv[0]);
    }

    printf("\n");

src/mempool/mempool.c

@ -1,757 +0,0 @@
#include "mempool.h"
#include <unistd.h>
#define NCX_SLAB_PAGE_MASK 3
#define NCX_SLAB_PAGE 0
#define NCX_SLAB_BIG 1
#define NCX_SLAB_EXACT 2
#define NCX_SLAB_SMALL 3
#define NCX_SLAB_PAGE_FREE 0
#define NCX_SLAB_PAGE_BUSY 0xffffffffffffffff
#define NCX_SLAB_PAGE_START 0x8000000000000000
#define NCX_SLAB_SHIFT_MASK 0x000000000000000f
#define NCX_SLAB_MAP_MASK 0xffffffff00000000
#define NCX_SLAB_MAP_SHIFT 32
#define NCX_SLAB_BUSY 0xffffffffffffffff
static ncx_slab_page_t *ncx_slab_alloc_pages(ncx_slab_pool_t *pool, ncx_uint_t pages);
static void ncx_slab_free_pages(ncx_slab_pool_t *pool, ncx_slab_page_t *page, ncx_uint_t pages);
static bool ncx_slab_empty(ncx_slab_pool_t *pool, ncx_slab_page_t *page);
static ncx_uint_t ncx_slab_max_size;
static ncx_uint_t ncx_slab_exact_size;
static ncx_uint_t ncx_slab_exact_shift;
static ncx_uint_t ncx_pagesize;
static ncx_uint_t ncx_pagesize_shift;
static ncx_uint_t ncx_real_pages;
void ncx_slab_init(ncx_slab_pool_t *pool) {
u_char *p;
size_t size;
ncx_uint_t i, n, pages;
ncx_slab_page_t *slots;
/*pagesize*/
ncx_pagesize = getpagesize();
for (n = ncx_pagesize, ncx_pagesize_shift = 0;
n >>= 1; ncx_pagesize_shift++) { /* void */ }
/* STUB */
if (ncx_slab_max_size == 0) {
ncx_slab_max_size = ncx_pagesize / 2;
ncx_slab_exact_size = ncx_pagesize / (8 * sizeof(uintptr_t));
for (n = ncx_slab_exact_size; n >>= 1; ncx_slab_exact_shift++) {
/* void */
}
}
pool->min_size = 1 << pool->min_shift;
p = (u_char *) pool + sizeof(ncx_slab_pool_t);
slots = (ncx_slab_page_t *) p;
n = ncx_pagesize_shift - pool->min_shift;
for (i = 0; i < n; i++) {
slots[i].slab = 0;
slots[i].next = &slots[i];
slots[i].prev = 0;
}
p += n * sizeof(ncx_slab_page_t);
size = pool->end - p;
pages = (ncx_uint_t) (size / (ncx_pagesize + sizeof(ncx_slab_page_t)));
ncx_memzero(p, pages * sizeof(ncx_slab_page_t));
pool->pages = (ncx_slab_page_t *) p;
pool->free.prev = 0;
pool->free.next = (ncx_slab_page_t *) p;
pool->pages->slab = pages;
pool->pages->next = &pool->free;
pool->pages->prev = (uintptr_t) &pool->free;
pool->start = (u_char *)
ncx_align_ptr((uintptr_t) p + pages * sizeof(ncx_slab_page_t),
ncx_pagesize);
ncx_real_pages = (pool->end - pool->start) / ncx_pagesize;
pool->pages->slab = ncx_real_pages;
}
void *ncx_slab_alloc(ncx_slab_pool_t *pool, size_t size) {
size_t s;
uintptr_t p, n, m, mask, *bitmap;
ncx_uint_t i, slot, shift, map;
ncx_slab_page_t *page, *prev, *slots;
if (size >= ncx_slab_max_size) {
page = ncx_slab_alloc_pages(pool, (size >> ncx_pagesize_shift)
+ ((size % ncx_pagesize) ? 1 : 0));
if (page) {
p = (page - pool->pages) << ncx_pagesize_shift;
p += (uintptr_t) pool->start;
} else {
p = 0;
}
goto done;
}
if (size > pool->min_size) {
shift = 1;
for (s = size - 1; s >>= 1; shift++) { /* void */ }
slot = shift - pool->min_shift;
} else {
shift = pool->min_shift;
slot = 0;
}
slots = (ncx_slab_page_t *) ((u_char *) pool + sizeof(ncx_slab_pool_t));
page = slots[slot].next;
if (page->next != page) {
if (shift < ncx_slab_exact_shift) {
do {
p = (page - pool->pages) << ncx_pagesize_shift;
bitmap = (uintptr_t *) (pool->start + p);
map = (1 << (ncx_pagesize_shift - shift))
/ (sizeof(uintptr_t) * 8);
for (n = 0; n < map; n++) {
if (bitmap[n] != NCX_SLAB_BUSY) {
for (m = 1, i = 0; m; m <<= 1, i++) {
if ((bitmap[n] & m)) {
continue;
}
bitmap[n] |= m;
i = ((n * sizeof(uintptr_t) * 8) << shift)
+ (i << shift);
if (bitmap[n] == NCX_SLAB_BUSY) {
for (n = n + 1; n < map; n++) {
if (bitmap[n] != NCX_SLAB_BUSY) {
p = (uintptr_t) bitmap + i;
goto done;
}
}
prev = (ncx_slab_page_t *)
(page->prev & ~NCX_SLAB_PAGE_MASK);
prev->next = page->next;
page->next->prev = page->prev;
page->next = NULL;
page->prev = NCX_SLAB_SMALL;
}
p = (uintptr_t) bitmap + i;
goto done;
}
}
}
page = page->next;
} while (page);
} else if (shift == ncx_slab_exact_shift) {
do {
if (page->slab != NCX_SLAB_BUSY) {
for (m = 1, i = 0; m; m <<= 1, i++) {
if ((page->slab & m)) {
continue;
}
page->slab |= m;
if (page->slab == NCX_SLAB_BUSY) {
prev = (ncx_slab_page_t *)
(page->prev & ~NCX_SLAB_PAGE_MASK);
prev->next = page->next;
page->next->prev = page->prev;
page->next = NULL;
page->prev = NCX_SLAB_EXACT;
}
p = (page - pool->pages) << ncx_pagesize_shift;
p += i << shift;
p += (uintptr_t) pool->start;
goto done;
}
}
page = page->next;
} while (page);
} else { /* shift > ncx_slab_exact_shift */
n = ncx_pagesize_shift - (page->slab & NCX_SLAB_SHIFT_MASK);
n = 1 << n;
n = ((uintptr_t) 1 << n) - 1;
mask = n << NCX_SLAB_MAP_SHIFT;
do {
if ((page->slab & NCX_SLAB_MAP_MASK) != mask) {
for (m = (uintptr_t) 1 << NCX_SLAB_MAP_SHIFT, i = 0;
m & mask;
m <<= 1, i++) {
if ((page->slab & m)) {
continue;
}
page->slab |= m;
if ((page->slab & NCX_SLAB_MAP_MASK) == mask) {
prev = (ncx_slab_page_t *)
(page->prev & ~NCX_SLAB_PAGE_MASK);
prev->next = page->next;
page->next->prev = page->prev;
page->next = NULL;
page->prev = NCX_SLAB_BIG;
}
p = (page - pool->pages) << ncx_pagesize_shift;
p += i << shift;
p += (uintptr_t) pool->start;
goto done;
}
}
page = page->next;
} while (page);
}
}
page = ncx_slab_alloc_pages(pool, 1);
if (page) {
if (shift < ncx_slab_exact_shift) {
p = (page - pool->pages) << ncx_pagesize_shift;
bitmap = (uintptr_t *) (pool->start + p);
s = 1 << shift;
n = (1 << (ncx_pagesize_shift - shift)) / 8 / s;
if (n == 0) {
n = 1;
}
bitmap[0] = (2 << n) - 1;
map = (1 << (ncx_pagesize_shift - shift)) / (sizeof(uintptr_t) * 8);
for (i = 1; i < map; i++) {
bitmap[i] = 0;
}
page->slab = shift;
page->next = &slots[slot];
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_SMALL;
slots[slot].next = page;
p = ((page - pool->pages) << ncx_pagesize_shift) + s * n;
p += (uintptr_t) pool->start;
goto done;
} else if (shift == ncx_slab_exact_shift) {
page->slab = 1;
page->next = &slots[slot];
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_EXACT;
slots[slot].next = page;
p = (page - pool->pages) << ncx_pagesize_shift;
p += (uintptr_t) pool->start;
goto done;
} else { /* shift > ncx_slab_exact_shift */
page->slab = ((uintptr_t) 1 << NCX_SLAB_MAP_SHIFT) | shift;
page->next = &slots[slot];
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_BIG;
slots[slot].next = page;
p = (page - pool->pages) << ncx_pagesize_shift;
p += (uintptr_t) pool->start;
goto done;
}
}
p = 0;
done:
return (void *) p;
}
void ncx_slab_free(ncx_slab_pool_t *pool, void *p) {
size_t size;
uintptr_t slab, m, *bitmap;
ncx_uint_t n, type, slot, shift, map;
ncx_slab_page_t *slots, *page;
if ((u_char *) p < pool->start || (u_char *) p > pool->end) {
// error("ncx_slab_free(): outside of pool");
goto fail;
}
n = ((u_char *) p - pool->start) >> ncx_pagesize_shift;
page = &pool->pages[n];
slab = page->slab;
type = page->prev & NCX_SLAB_PAGE_MASK;
switch (type) {
case NCX_SLAB_SMALL:
shift = slab & NCX_SLAB_SHIFT_MASK;
size = 1 << shift;
if ((uintptr_t) p & (size - 1)) {
goto wrong_chunk;
}
n = ((uintptr_t) p & (ncx_pagesize - 1)) >> shift;
m = (uintptr_t) 1 << (n & (sizeof(uintptr_t) * 8 - 1));
n /= (sizeof(uintptr_t) * 8);
bitmap = (uintptr_t *) ((uintptr_t) p & ~(ncx_pagesize - 1));
if (bitmap[n] & m) {
if (page->next == NULL) {
slots = (ncx_slab_page_t *)
((u_char *) pool + sizeof(ncx_slab_pool_t));
slot = shift - pool->min_shift;
page->next = slots[slot].next;
slots[slot].next = page;
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_SMALL;
page->next->prev = (uintptr_t) page | NCX_SLAB_SMALL;
}
bitmap[n] &= ~m;
n = (1 << (ncx_pagesize_shift - shift)) / 8 / (1 << shift);
if (n == 0) {
n = 1;
}
if (bitmap[0] & ~(((uintptr_t) 1 << n) - 1)) {
goto done;
}
map = (1 << (ncx_pagesize_shift - shift)) / (sizeof(uintptr_t) * 8);
for (n = 1; n < map; n++) {
if (bitmap[n]) {
goto done;
}
}
ncx_slab_free_pages(pool, page, 1);
goto done;
}
goto chunk_already_free;
case NCX_SLAB_EXACT:
m = (uintptr_t) 1 <<
(((uintptr_t) p & (ncx_pagesize - 1)) >> ncx_slab_exact_shift);
size = ncx_slab_exact_size;
if ((uintptr_t) p & (size - 1)) {
goto wrong_chunk;
}
if (slab & m) {
if (slab == NCX_SLAB_BUSY) {
slots = (ncx_slab_page_t *)
((u_char *) pool + sizeof(ncx_slab_pool_t));
slot = ncx_slab_exact_shift - pool->min_shift;
page->next = slots[slot].next;
slots[slot].next = page;
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_EXACT;
page->next->prev = (uintptr_t) page | NCX_SLAB_EXACT;
}
page->slab &= ~m;
if (page->slab) {
goto done;
}
ncx_slab_free_pages(pool, page, 1);
goto done;
}
goto chunk_already_free;
case NCX_SLAB_BIG:
shift = slab & NCX_SLAB_SHIFT_MASK;
size = 1 << shift;
if ((uintptr_t) p & (size - 1)) {
goto wrong_chunk;
}
m = (uintptr_t) 1 << ((((uintptr_t) p & (ncx_pagesize - 1)) >> shift)
+ NCX_SLAB_MAP_SHIFT);
if (slab & m) {
if (page->next == NULL) {
slots = (ncx_slab_page_t *)
((u_char *) pool + sizeof(ncx_slab_pool_t));
slot = shift - pool->min_shift;
page->next = slots[slot].next;
slots[slot].next = page;
page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_BIG;
page->next->prev = (uintptr_t) page | NCX_SLAB_BIG;
}
page->slab &= ~m;
if (page->slab & NCX_SLAB_MAP_MASK) {
goto done;
}
ncx_slab_free_pages(pool, page, 1);
goto done;
}
goto chunk_already_free;
case NCX_SLAB_PAGE:
if ((uintptr_t) p & (ncx_pagesize - 1)) {
goto wrong_chunk;
}
if (slab == NCX_SLAB_PAGE_FREE) {
// alert("ncx_slab_free(): page is already free");
goto fail;
}
if (slab == NCX_SLAB_PAGE_BUSY) {
// alert("ncx_slab_free(): pointer to wrong page");
goto fail;
}
n = ((u_char *) p - pool->start) >> ncx_pagesize_shift;
size = slab & ~NCX_SLAB_PAGE_START;
ncx_slab_free_pages(pool, &pool->pages[n], size);
return;
}
/* not reached */
return;
done:
return;
wrong_chunk:
// error("ncx_slab_free(): pointer to wrong chunk");
goto fail;
chunk_already_free:
// error("ncx_slab_free(): chunk is already free");
fail:
return;
}
static ncx_slab_page_t *ncx_slab_alloc_pages(ncx_slab_pool_t *pool, ncx_uint_t pages) {
ncx_slab_page_t *page, *p;
for (page = pool->free.next; page != &pool->free; page = page->next) {
if (page->slab >= pages) {
if (page->slab > pages) {
page[pages].slab = page->slab - pages;
page[pages].next = page->next;
page[pages].prev = page->prev;
p = (ncx_slab_page_t *) page->prev;
p->next = &page[pages];
page->next->prev = (uintptr_t) &page[pages];
} else {
p = (ncx_slab_page_t *) page->prev;
p->next = page->next;
page->next->prev = page->prev;
}
page->slab = pages | NCX_SLAB_PAGE_START;
page->next = NULL;
page->prev = NCX_SLAB_PAGE;
if (--pages == 0) {
return page;
}
for (p = page + 1; pages; pages--) {
p->slab = NCX_SLAB_PAGE_BUSY;
p->next = NULL;
p->prev = NCX_SLAB_PAGE;
p++;
}
return page;
}
}
// error("ncx_slab_alloc() failed: no memory");
return NULL;
}
static void ncx_slab_free_pages(ncx_slab_pool_t *pool, ncx_slab_page_t *page, ncx_uint_t pages) {
ncx_slab_page_t *prev;
if (pages > 1) {
ncx_memzero(&page[1], (pages - 1) * sizeof(ncx_slab_page_t));
}
if (page->next) {
prev = (ncx_slab_page_t *) (page->prev & ~NCX_SLAB_PAGE_MASK);
prev->next = page->next;
page->next->prev = page->prev;
}
page->slab = pages;
page->prev = (uintptr_t) &pool->free;
page->next = pool->free.next;
page->next->prev = (uintptr_t) page;
pool->free.next = page;
#ifdef PAGE_MERGE
if (pool->pages != page) {
prev = page - 1;
if (ncx_slab_empty(pool, prev)) {
for (; prev >= pool->pages; prev--) {
if (prev->slab != 0)
{
pool->free.next = page->next;
page->next->prev = (uintptr_t) &pool->free;
prev->slab += pages;
ncx_memzero(page, sizeof(ncx_slab_page_t));
page = prev;
break;
}
}
}
}
if ((page - pool->pages + page->slab) < ncx_real_pages) {
next = page + page->slab;
if (ncx_slab_empty(pool, next))
{
prev = (ncx_slab_page_t *) (next->prev);
prev->next = next->next;
next->next->prev = next->prev;
page->slab += next->slab;
ncx_memzero(next, sizeof(ncx_slab_page_t));
}
}
#endif
}
void ncx_slab_stat(ncx_slab_pool_t *pool, ncx_slab_stat_t *stat) {
uintptr_t m, n, mask, slab;
uintptr_t *bitmap;
ncx_uint_t i, j, map, type, obj_size;
ncx_slab_page_t *page;
ncx_memzero(stat, sizeof(ncx_slab_stat_t));
page = pool->pages;
stat->pages = (pool->end - pool->start) / ncx_pagesize;
for (i = 0; i < stat->pages; i++) {
slab = page->slab;
type = page->prev & NCX_SLAB_PAGE_MASK;
switch (type) {
case NCX_SLAB_SMALL:
n = (page - pool->pages) << ncx_pagesize_shift;
bitmap = (uintptr_t *) (pool->start + n);
obj_size = 1 << slab;
map = (1 << (ncx_pagesize_shift - slab))
/ (sizeof(uintptr_t) * 8);
for (j = 0; j < map; j++) {
for (m = 1; m; m <<= 1) {
if ((bitmap[j] & m)) {
stat->used_size += obj_size;
stat->b_small += obj_size;
}
}
}
stat->p_small++;
break;
case NCX_SLAB_EXACT:
if (slab == NCX_SLAB_BUSY) {
stat->used_size += sizeof(uintptr_t) * 8 * ncx_slab_exact_size;
stat->b_exact += sizeof(uintptr_t) * 8 * ncx_slab_exact_size;
} else {
for (m = 1; m; m <<= 1) {
if (slab & m) {
stat->used_size += ncx_slab_exact_size;
stat->b_exact += ncx_slab_exact_size;
}
}
}
stat->p_exact++;
break;
case NCX_SLAB_BIG:
j = ncx_pagesize_shift - (slab & NCX_SLAB_SHIFT_MASK);
j = 1 << j;
j = ((uintptr_t) 1 << j) - 1;
mask = j << NCX_SLAB_MAP_SHIFT;
obj_size = 1 << (slab & NCX_SLAB_SHIFT_MASK);
for (m = (uintptr_t) 1 << NCX_SLAB_MAP_SHIFT; m & mask; m <<= 1) {
if ((page->slab & m)) {
stat->used_size += obj_size;
stat->b_big += obj_size;
}
}
stat->p_big++;
break;
case NCX_SLAB_PAGE:
if (page->prev == NCX_SLAB_PAGE) {
slab = slab & ~NCX_SLAB_PAGE_START;
stat->used_size += slab * ncx_pagesize;
stat->b_page += slab * ncx_pagesize;
stat->p_page += slab;
i += (slab - 1);
break;
}
default:
if (slab > stat->max_free_pages) {
stat->max_free_pages = page->slab;
}
stat->free_page += slab;
i += (slab - 1);
break;
}
page = pool->pages + i + 1;
}
stat->pool_size = pool->end - pool->start;
stat->used_pct = stat->used_size * 100 / stat->pool_size;
}
static bool ncx_slab_empty(ncx_slab_pool_t *pool, ncx_slab_page_t *page) {
ncx_slab_page_t *prev;
if (page->slab == 0) {
return true;
}
//page->prev == PAGE | SMALL | EXACT | BIG
if (page->next == NULL) {
return false;
}
prev = (ncx_slab_page_t *) (page->prev & ~NCX_SLAB_PAGE_MASK);
while (prev >= pool->pages) {
prev = (ncx_slab_page_t *) (prev->prev & ~NCX_SLAB_PAGE_MASK);
}
if (prev == &pool->free) {
return true;
}
return false;
}
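// For context, illustrative usage of the slab allocator this commit removes
// (field setup mirrors what ncx_slab_init() expects; sizes are arbitrary):
static void sketch_slab_usage(void) {
    size_t pool_size = 1024 * 1024;
    ncx_slab_pool_t *pool = malloc(pool_size);
    pool->addr = pool;
    pool->min_shift = 3;
    pool->end = (u_char *) pool + pool_size;
    ncx_slab_init(pool);

    void *p = ncx_slab_alloc(pool, 128);  // served from a small-object slab
    ncx_slab_free(pool, p);
    free(pool);
}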

src/mempool/mempool.h

@ -1,62 +0,0 @@
#ifndef SIST2_MEMPOOL_H
#define SIST2_MEMPOOL_H
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdbool.h>
typedef unsigned char u_char;
typedef uintptr_t ncx_uint_t;
#ifndef NCX_ALIGNMENT
#define NCX_ALIGNMENT sizeof(unsigned long)
#endif
#define ncx_align(d, a) (((d) + (a - 1)) & ~(a - 1))
#define ncx_align_ptr(p, a) (u_char *) (((uintptr_t) (p) + ((uintptr_t) a - 1)) & ~((uintptr_t) a - 1))
#define ncx_memzero(buf, n) (void) memset(buf, 0, n)
#define ncx_memset(buf, c, n) (void) memset(buf, c, n)
typedef struct ncx_slab_page_s ncx_slab_page_t;
struct ncx_slab_page_s {
uintptr_t slab;
ncx_slab_page_t *next;
uintptr_t prev;
};
typedef struct {
size_t min_size;
size_t min_shift;
ncx_slab_page_t *pages;
ncx_slab_page_t free;
u_char *start;
u_char *end;
//ncx_shmtx_t mutex;
void *addr;
} ncx_slab_pool_t;
typedef struct {
size_t pool_size, used_size, used_pct;
size_t pages, free_page;
size_t p_small, p_exact, p_big, p_page;
size_t b_small, b_exact, b_big, b_page;
size_t max_free_pages;
} ncx_slab_stat_t;
void ncx_slab_init(ncx_slab_pool_t *mempool);
void *ncx_slab_alloc(ncx_slab_pool_t *mempool, size_t size);
void ncx_slab_free(ncx_slab_pool_t *mempool, void *p);
void ncx_slab_stat(ncx_slab_pool_t *mempool, ncx_slab_stat_t *stat);
#endif //SIST2_MEMPOOL_H

src/parsing/fs_util.h Normal file

@ -0,0 +1,42 @@
#ifndef SIST2_FS_UTIL_H
#define SIST2_FS_UTIL_H
#include "src/sist.h"
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
static int fs_read(struct vfile *f, void *buf, size_t size) {
if (f->fd == -1) {
SHA1_Init(&f->sha1_ctx);
f->fd = open(f->filepath, O_RDONLY);
if (f->fd == -1) {
return -1;
}
}
int ret = (int) read(f->fd, buf, size);
    if (ret > 0 && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
}
return ret;
}
static void fs_close(struct vfile *f) {
if (f->fd != -1) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
close(f->fd);
}
}
static void fs_reset(struct vfile *f) {
if (f->fd != -1) {
lseek(f->fd, 0, SEEK_SET);
}
}
#endif
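// Sketch: wiring these callbacks into a vfile for an on-disk file (mirrors
// what parse() does elsewhere in this commit; assumes the caller already
// filled f->filepath, and omits error handling):
static void sketch_vfile_wiring(struct vfile *f) {
    f->fd = -1;                             // fs_read() opens the file lazily
    f->read = fs_read;
    f->read_rewindable = fs_read;
    f->reset = fs_reset;
    f->close = fs_close;

    char buf[512];
    int n = f->read(f, buf, sizeof(buf));   // bytes read, or -1 on error
    (void) n;
    CLOSE_FILE(*f)
}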

src/parsing/magic_util.c Normal file

@ -0,0 +1,32 @@
#include "magic_util.h"
#include "src/log.h"
#include "mime.h"
#include <magic.h>
#include "src/magic_generated.c"
char *magic_buffer_embedded(void *buffer, size_t buffer_size) {
magic_t magic = magic_open(MAGIC_MIME_TYPE);
const char *magic_buffers[1] = {magic_database_buffer,};
size_t sizes[1] = {sizeof(magic_database_buffer),};
// TODO: check if we can reuse the magic instance
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
if (load_ret != 0) {
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret);
}
const char *magic_mime_str = magic_buffer(magic, buffer, buffer_size);
char *return_value = NULL;
if (magic_mime_str != NULL) {
return_value = malloc(strlen(magic_mime_str) + 1);
strcpy(return_value, magic_mime_str);
}
magic_close(magic);
return return_value;
}
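// Hypothetical caller: sniff a buffer's media type with the embedded magic
// database. The returned string is heap-allocated and owned by the caller.
static void sketch_magic_usage(const void *head, size_t n) {
    char *mime_str = magic_buffer_embedded((void *) head, n);
    if (mime_str != NULL) {
        printf("detected: %s\n", mime_str);
        free(mime_str);
    }
}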

src/parsing/magic_util.h Normal file

@ -0,0 +1,8 @@
#ifndef SIST2_MAGIC_UTIL_H
#define SIST2_MAGIC_UTIL_H
#include <stdio.h>
char *magic_buffer_embedded(void *buffer, size_t buffer_size);
#endif //SIST2_MAGIC_UTIL_H

src/parsing/mime.c

@ -1,22 +1,30 @@
#include "mime.h" #include "mime.h"
#include <zlib.h>
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext) { unsigned int mime_get_mime_by_ext(const char *ext) {
char lower[8]; unsigned char lower[16];
char *p = lower; unsigned char *p = lower;
int cnt = 0; int cnt = 0;
while ((*ext) != '\0' && cnt + 1 < sizeof(lower)) { while ((*ext) != '\0' && cnt + 1 < sizeof(lower)) {
*p++ = (char)tolower(*ext++); *p++ = tolower(*ext++);
cnt++; cnt++;
} }
*p = '\0'; *p = '\0';
return (size_t) g_hash_table_lookup(ext_table, lower);
unsigned long crc = crc32(0, lower, cnt);
unsigned int mime = mime_extension_lookup(crc);
return mime;
} }
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str) { unsigned int mime_get_mime_by_string(const char *str) {
const char * ptr = str; const char *ptr = str;
while (*ptr == ' ' || *ptr == '[') { while (*ptr == ' ' || *ptr == '[') {
ptr++; ptr++;
} }
return (size_t) g_hash_table_lookup(mime_table, ptr);
unsigned long crc = crc32(0, (unsigned char *) ptr, strlen(ptr));
return mime_name_lookup(crc);
} }
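// Illustration of the new lookup path: the lowercased extension (or the
// libmagic name) is hashed with zlib's crc32 and resolved through the
// generated tables. The printed value is an example, not taken from the
// generated code.
static void sketch_mime_lookup(void) {
    unsigned int mime = mime_get_mime_by_ext("JPG");    // case-insensitive
    if (mime != 0) {
        printf("%s\n", mime_get_mime_text(mime));       // e.g. "image/jpeg"
    }
}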

src/parsing/mime.h

@ -51,14 +51,14 @@ enum major_mime {
enum mime;

unsigned int mime_name_lookup(unsigned long mime_crc32);
unsigned int mime_extension_lookup(unsigned long extension_crc32);

const char *mime_get_mime_text(unsigned int);

unsigned int mime_get_mime_by_ext(const char *ext);
unsigned int mime_get_mime_by_string(const char *str);

#endif

File diff suppressed because it is too large

src/parsing/parse.c

@ -5,235 +5,242 @@
#include "mime.h" #include "mime.h"
#include "src/io/serialize.h" #include "src/io/serialize.h"
#include "src/parsing/sidecar.h" #include "src/parsing/sidecar.h"
#include "src/magic_generated.c" #include "src/parsing/fs_util.h"
#include "src/parsing/magic_util.h"
#include <magic.h> #include <pthread.h>
#define MIN_VIDEO_SIZE (1024 * 64) #define MIN_VIDEO_SIZE (1024 * 64)
#define MIN_IMAGE_SIZE (512) #define MIN_IMAGE_SIZE (512)
int fs_read(struct vfile *f, void *buf, size_t size) { #define MAGIC_BUF_SIZE (4096 * 6)
if (f->fd == -1) { typedef enum {
SHA1_Init(&f->sha1_ctx); FILETYPE_DONT_PARSE,
FILETYPE_RAW,
FILETYPE_MEDIA,
FILETYPE_EBOOK,
FILETYPE_MARKUP,
FILETYPE_TEXT,
FILETYPE_FONT,
FILETYPE_ARCHIVE,
FILETYPE_OOXML,
FILETYPE_COMIC,
FILETYPE_MOBI,
FILETYPE_SIST2_SIDECAR,
FILETYPE_MSDOC,
FILETYPE_JSON,
FILETYPE_NDJSON,
} file_type_t;
f->fd = open(f->filepath, O_RDONLY); file_type_t get_file_type(unsigned int mime, size_t size, const char *filepath) {
if (f->fd == -1) {
return -1; int major_mime = MAJOR_MIME(mime);
if (!(SHOULD_PARSE(mime))) {
return FILETYPE_DONT_PARSE;
} else if (IS_RAW(mime)) {
return FILETYPE_RAW;
} else if ((major_mime == MimeVideo && size >= MIN_VIDEO_SIZE) ||
(major_mime == MimeImage && size >= MIN_IMAGE_SIZE) || major_mime == MimeAudio) {
return FILETYPE_MEDIA;
} else if (IS_PDF(mime)) {
return FILETYPE_EBOOK;
} else if (major_mime == MimeText && ScanCtx.text_ctx.content_size > 0) {
if (IS_MARKUP(mime)) {
return FILETYPE_MARKUP;
} else {
return FILETYPE_TEXT;
}
} else if (IS_FONT(mime)) {
return FILETYPE_FONT;
} else if (
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
IS_ARC(mime) ||
(IS_ARC_FILTER(mime) && should_parse_filtered_file(filepath))
)) {
return FILETYPE_ARCHIVE;
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(mime)) {
return FILETYPE_OOXML;
} else if (is_cbr(&ScanCtx.comic_ctx, mime) || is_cbz(&ScanCtx.comic_ctx, mime)) {
return FILETYPE_COMIC;
} else if (IS_MOBI(mime)) {
return FILETYPE_MOBI;
} else if (mime == MIME_SIST2_SIDECAR) {
return FILETYPE_SIST2_SIDECAR;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, mime)) {
return FILETYPE_MSDOC;
} else if (is_json(&ScanCtx.json_ctx, mime)) {
return FILETYPE_JSON;
} else if (is_ndjson(&ScanCtx.json_ctx, mime)) {
return FILETYPE_NDJSON;
}
}
#define GET_MIME_ERROR_FATAL (-1)
int get_mime(parse_job_t *job) {
char *extension = job->filepath + job->ext;
int mime = 0;
if (job->vfile.st_size == 0) {
return MIME_EMPTY;
}
if (*extension != '\0' && (job->ext - job->base != 1)) {
mime = (int) mime_get_mime_by_ext(extension);
if (mime != 0) {
return mime;
} }
} }
int ret = (int) read(f->fd, buf, size); if (strlen(extension) == 0 && strlen(job->filepath + job->base) == 40) {
fprintf(stderr, "GIT? %s", job->filepath);
if (ret != 0 && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
} }
return ret; if (ScanCtx.fast) {
} return 0;
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
void fs_close(struct vfile *f) {
if (f->fd != -1) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
close(f->fd);
} }
}
void fs_reset(struct vfile *f) { // Get mime type with libmagic
if (f->fd != -1) { if (job->vfile.read_rewindable == NULL) {
lseek(f->fd, 0, SEEK_SET); LOG_WARNING(job->filepath,
"File does not support rewindable reads, cannot guess Media type");
return 0;
} }
char *buf[MAGIC_BUF_SIZE];
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
if (bytes_read < 0) {
if (job->vfile.is_fs_file) {
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno));
} else {
LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc));
}
return GET_MIME_ERROR_FATAL;
}
char *magic_mime_str = magic_buffer_embedded(buf, bytes_read);
if (magic_mime_str != NULL) {
mime = (int) mime_get_mime_by_string(magic_mime_str);
free(magic_mime_str);
if (mime == 0) {
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
return 0;
}
}
if (job->vfile.reset != NULL) {
job->vfile.reset(&job->vfile);
}
return mime;
} }
void set_dbg_current_file(parse_job_t *job) { void parse(parse_job_t *job) {
unsigned long long pid = (unsigned long long) pthread_self();
pthread_mutex_lock(&ScanCtx.dbg_current_files_mu);
g_hash_table_replace(ScanCtx.dbg_current_files, GINT_TO_POINTER(pid), job);
pthread_mutex_unlock(&ScanCtx.dbg_current_files_mu);
}
void parse_job(parse_job_t *job) { if (job->vfile.is_fs_file) {
tpool_work_arg_shm_t *arg = malloc(sizeof(tpool_work_arg_shm_t) + sizeof(*job)); job->vfile.read = fs_read;
job->vfile.read_rewindable = fs_read;
memcpy(arg->arg, job, sizeof(*job)); job->vfile.reset = fs_reset;
arg->arg_size = -1; job->vfile.close = fs_close;
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
parse(arg); }
free(arg);
}
void parse(tpool_work_arg_shm_t *arg) {
parse_job_t *job = (void*)arg->arg;
document_t *doc = malloc(sizeof(document_t)); document_t *doc = malloc(sizeof(document_t));
set_dbg_current_file(job);
strcpy(doc->filepath, job->filepath); strcpy(doc->filepath, job->filepath);
doc->ext = (short) job->ext; doc->ext = job->ext;
doc->base = (short) job->base; doc->base = job->base;
char *rel_path = doc->filepath + ScanCtx.index.desc.root_len;
generate_doc_id(rel_path, doc->doc_id);
doc->meta_head = NULL; doc->meta_head = NULL;
doc->meta_tail = NULL; doc->meta_tail = NULL;
doc->mime = 0;
doc->size = job->vfile.st_size; doc->size = job->vfile.st_size;
doc->mtime = (int) job->vfile.mtime; doc->mtime = (int) job->vfile.mtime;
doc->mime = get_mime(job);
generate_doc_id(doc->filepath + ScanCtx.index.desc.root_len, doc->doc_id);
int inc_ts = incremental_get(ScanCtx.original_table, doc->doc_id); if (doc->mime == GET_MIME_ERROR_FATAL) {
if (inc_ts != 0 && inc_ts == job->vfile.mtime) { pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
pthread_mutex_lock(&ScanCtx.copy_table_mu); ScanCtx.dbg_failed_files_count += 1;
incremental_mark_file(ScanCtx.copy_table, doc->doc_id); pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
CLOSE_FILE(job->vfile)
free(doc);
return;
}
if (database_mark_document(ProcData.index_db, doc->doc_id, doc->mtime)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu); pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_skipped_files_count += 1; ScanCtx.dbg_skipped_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu); pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
CLOSE_FILE(job->vfile) CLOSE_FILE(job->vfile)
free(doc); free(doc);
return; return;
} }
if (ScanCtx.new_table != NULL) {
pthread_mutex_lock(&ScanCtx.copy_table_mu);
incremental_mark_file(ScanCtx.new_table, doc->doc_id);
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
}
char *buf[MAGIC_BUF_SIZE];
if (LogCtx.very_verbose) { if (LogCtx.very_verbose) {
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id) LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id);
} }
if (job->ext > 4096) { switch (get_file_type(doc->mime, doc->size, doc->filepath)) {
fprintf(stderr, "Ext is %d, filename is %s\n", job->ext, job->filepath); case FILETYPE_RAW:
} parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
break;
if (job->vfile.st_size == 0) { case FILETYPE_MEDIA:
doc->mime = MIME_EMPTY; parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
} else if (*(job->filepath + job->ext) != '\0' && (job->ext - job->base != 1)) { break;
doc->mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext); case FILETYPE_EBOOK:
} parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
break;
if (doc->mime == 0 && !ScanCtx.fast) { case FILETYPE_MARKUP:
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
// Get mime type with libmagic break;
if (job->vfile.read_rewindable == NULL) { case FILETYPE_TEXT:
LOG_WARNING(job->filepath, parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
"File does not support rewindable reads, cannot guess Media type"); break;
goto abort; case FILETYPE_FONT:
} parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
break;
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE); case FILETYPE_ARCHIVE:
if (bytes_read < 0) { parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
break;
if (job->vfile.is_fs_file) { case FILETYPE_OOXML:
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno)) parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
} else { break;
LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc)) case FILETYPE_COMIC:
} parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
break;
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu); case FILETYPE_MOBI:
ScanCtx.dbg_failed_files_count += 1; parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu); break;
case FILETYPE_SIST2_SIDECAR:
parse_sidecar(&job->vfile, doc);
CLOSE_FILE(job->vfile) CLOSE_FILE(job->vfile)
free(doc); free(doc);
return; return;
} case FILETYPE_MSDOC:
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
magic_t magic = magic_open(MAGIC_MIME_TYPE); break;
case FILETYPE_JSON:
const char *magic_buffers[1] = {magic_database_buffer,}; parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
size_t sizes[1] = {sizeof(magic_database_buffer),}; break;
case FILETYPE_NDJSON:
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1); parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
break;
if (load_ret != 0) { case FILETYPE_DONT_PARSE:
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret) default:
} break;
const char *magic_mime_str = magic_buffer(magic, buf, bytes_read);
if (magic_mime_str != NULL) {
doc->mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
LOG_DEBUGF(job->filepath, "libmagic: %s", magic_mime_str);
if (doc->mime == 0) {
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
}
}
if (job->vfile.reset != NULL) {
job->vfile.reset(&job->vfile);
}
magic_close(magic);
} }
int mmime = MAJOR_MIME(doc->mime);
if (!(SHOULD_PARSE(doc->mime))) {
} else if (IS_RAW(doc->mime)) {
parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
} else if ((mmime == MimeVideo && doc->size >= MIN_VIDEO_SIZE) ||
(mmime == MimeImage && doc->size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
} else if (IS_PDF(doc->mime)) {
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
} else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) {
if (IS_MARKUP(doc->mime)) {
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
} else {
parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
}
} else if (IS_FONT(doc->mime)) {
parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
} else if (
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
IS_ARC(doc->mime) ||
(IS_ARC_FILTER(doc->mime) && should_parse_filtered_file(doc->filepath, doc->ext))
)) {
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(doc->mime)) {
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
} else if (is_cbr(&ScanCtx.comic_ctx, doc->mime) || is_cbz(&ScanCtx.comic_ctx, doc->mime)) {
parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
} else if (IS_MOBI(doc->mime)) {
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
} else if (doc->mime == MIME_SIST2_SIDECAR) {
parse_sidecar(&job->vfile, doc);
CLOSE_FILE(job->vfile)
free(doc);
return;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
} else if (is_json(&ScanCtx.json_ctx, doc->mime)) {
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
} else if (is_ndjson(&ScanCtx.json_ctx, doc->mime)) {
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
}
abort:
//Parent meta //Parent meta
if (job->parent[0] != '\0') { if (job->parent[0] != '\0') {
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + SIST_INDEX_ID_LEN); meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + SIST_INDEX_ID_LEN);
@ -247,12 +254,8 @@ void parse(tpool_work_arg_shm_t *arg) {
if (job->vfile.has_checksum) { if (job->vfile.has_checksum) {
char sha1_digest_str[SHA1_STR_LENGTH]; char sha1_digest_str[SHA1_STR_LENGTH];
buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str); buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str);
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str); APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str)
} }
write_document(doc); write_document(doc);
} }
void cleanup_parse() {
// noop
}
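// Schematic of the new pipeline for a single file. create_parse_job() is
// assumed from walk.c (not shown in this diff) and the path is hypothetical:
//
//     parse_job_t *job = create_parse_job("/data/photos/cat.jpg", mtime, st_size);
//     job->vfile.is_fs_file = TRUE;
//     parse(job);   // get_mime() -> get_file_type() -> parse_media() -> write_document()
//     free(job);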

src/parsing/parse.h

@ -4,15 +4,7 @@
#include "../sist.h" #include "../sist.h"
#include "src/tpool.h" #include "src/tpool.h"
#define MAGIC_BUF_SIZE (4096 * 6)
int fs_read(struct vfile *f, void *buf, size_t size); void parse(parse_job_t *arg);
void fs_close(struct vfile *f);
void fs_reset(struct vfile *f);
void parse_job(parse_job_t *job);
void parse(tpool_work_arg_shm_t *arg);
void cleanup_parse();
#endif #endif

src/parsing/sidecar.c

@ -4,12 +4,12 @@
void parse_sidecar(vfile_t *vfile, document_t *doc) {
    LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath);

    size_t size;
    char *buf = read_all(vfile, &size);
    if (buf == NULL) {
        LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath);
        return;
    }
@ -18,7 +18,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
    cJSON *json = cJSON_Parse(buf);
    if (json == NULL) {
        LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath);
        return;
    }
    char *json_str = cJSON_PrintUnformatted(json);
@ -32,8 +32,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
    generate_doc_id(rel_path, assoc_doc_id);

    database_write_document_sidecar(ProcData.index_db, assoc_doc_id, json_str);

    cJSON_Delete(json);
    free(json_str);
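// For context: a sidecar is a user-supplied JSON file that rides along with
// the original and is stored under the original's doc id, e.g. (the .s2meta
// suffix is sist2's sidecar extension):
//
//     photo.jpg           <- scanned normally
//     photo.jpg.s2meta    <- {"tags": ["vacation"]} merged at index time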

src/sist.h

@ -49,8 +49,11 @@
#include <ctype.h>

#include "git_hash.h"

#define VERSION "3.0.0"
static const char *const Version = VERSION;
static const int VersionMajor = 3;
static const int VersionMinor = 0;
static const int VersionPatch = 0;

#ifndef SIST_PLATFORM
#define SIST_PLATFORM unknown

src/stats.c

@ -1,343 +0,0 @@
#include "sist.h"
#include "io/serialize.h"
#include "ctx.h"
static GHashTable *FlatTree;
static GHashTable *BufferTable;
static GHashTable *AggMime;
static GHashTable *AggSize;
static GHashTable *AggDate;
#define SIZE_BUCKET (long)(5 * 1024 * 1024)
#define DATE_BUCKET (long)(2629800)
static long TotalSize = 0;
static long DocumentCount = 0;
typedef struct {
long size;
long count;
} agg_t;
void fill_tables(cJSON *document, UNUSED(const char index_id[SIST_INDEX_ID_LEN])) {
if (cJSON_GetObjectItem(document, "parent") != NULL) {
return;
}
const char *json_path = cJSON_GetObjectItem(document, "path")->valuestring;
char *path = malloc(strlen(json_path) + 1);
strcpy(path, json_path);
const char *json_mime = cJSON_GetObjectItem(document, "mime")->valuestring;
char *mime;
if (json_mime == NULL) {
mime = NULL;
} else {
mime = malloc(strlen(json_mime) + 1);
strcpy(mime, json_mime);
}
long size = (long) cJSON_GetObjectItem(document, "size")->valuedouble;
int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;
// treemap
void *existing_path = g_hash_table_lookup(FlatTree, path);
if (existing_path == NULL) {
g_hash_table_insert(FlatTree, path, (gpointer) size);
} else {
g_hash_table_replace(FlatTree, path, (gpointer) ((long) existing_path + size));
}
// mime agg
if (mime != NULL) {
agg_t *orig_agg = g_hash_table_lookup(AggMime, mime);
if (orig_agg == NULL) {
agg_t *agg = malloc(sizeof(agg_t));
agg->size = size;
agg->count = 1;
g_hash_table_insert(AggMime, mime, agg);
} else {
orig_agg->size += size;
orig_agg->count += 1;
free(mime);
}
}
// size agg
long size_bucket = size - (size % SIZE_BUCKET);
agg_t *orig_agg = g_hash_table_lookup(AggSize, (gpointer) size_bucket);
if (orig_agg == NULL) {
agg_t *agg = malloc(sizeof(agg_t));
agg->size = size;
agg->count = 1;
g_hash_table_insert(AggSize, (gpointer) size_bucket, agg);
} else {
orig_agg->count += 1;
orig_agg->size += size;
}
// date agg
long date_bucket = mtime - (mtime % DATE_BUCKET);
orig_agg = g_hash_table_lookup(AggDate, (gpointer) date_bucket);
if (orig_agg == NULL) {
agg_t *agg = malloc(sizeof(agg_t));
agg->size = size;
agg->count = 1;
g_hash_table_insert(AggDate, (gpointer) date_bucket, agg);
} else {
orig_agg->count += 1;
orig_agg->size += size;
}
TotalSize += size;
DocumentCount += 1;
}
void read_index_into_tables(index_t *index) {
char file_path[PATH_MAX];
READ_INDICES(file_path, index->path, read_index(file_path, index->desc.id, index->desc.type, fill_tables), {}, 1);
}
static size_t rfind(const char *str, int c) {
for (int i = (int)strlen(str); i >= 0; i--) {
if (str[i] == c) {
return i;
}
}
return -1;
}
int merge_up(double thresh) {
long min_size = (long) (thresh * (double) TotalSize);
int count = 0;
GHashTableIter iter;
g_hash_table_iter_init(&iter, FlatTree);
void *key;
void *value;
while (g_hash_table_iter_next(&iter, &key, &value)) {
long size = (long) value;
if (size < min_size) {
int stop = rfind(key, '/');
if (stop == -1) {
stop = 0;
}
char *parent = malloc(stop + 1);
strncpy(parent, key, stop);
*(parent + stop) = '\0';
void *existing_parent = g_hash_table_lookup(FlatTree, parent);
if (existing_parent == NULL) {
void *existing_parent2_key;
void *existing_parent2_val;
int found = g_hash_table_lookup_extended(BufferTable, parent, &existing_parent2_key,
&existing_parent2_val);
if (!found) {
g_hash_table_insert(BufferTable, parent, value);
} else {
g_hash_table_replace(BufferTable, parent, (gpointer) ((long) existing_parent2_val + size));
free(existing_parent2_key);
}
} else {
g_hash_table_replace(FlatTree, parent, (gpointer) ((long) existing_parent + size));
}
g_hash_table_iter_remove(&iter);
count += 1;
}
}
g_hash_table_iter_init(&iter, BufferTable);
while (g_hash_table_iter_next(&iter, &key, &value)) {
g_hash_table_insert(FlatTree, key, value);
g_hash_table_iter_remove(&iter);
}
int size = g_hash_table_size(FlatTree);
LOG_DEBUGF("stats.c", "Merge up iteration (%d merged, %d in tree)", count, size)
return count;
}
/**
 * Assumes dst is at least PATH_MAX * 4 bytes
 */
void csv_escape(char *dst, const char *str) {
const char *ptr = str;
char *out = dst;
if (rfind(str, ',') == -1 && rfind(str, '"') == -1) {
strcpy(dst, str);
return;
}
*out++ = '"';
char c;
while ((c = *ptr++) != 0) {
if (c == '"') {
*out++ = '"';
*out++ = '"';
} else {
*out++ = c;
}
}
*out++ = '"';
*out = '\0';
}
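// Worked examples of the escaping rules:
//   csv_escape(dst, "plain.txt")   -> plain.txt        (no quoting needed)
//   csv_escape(dst, "a,b.txt")     -> "a,b.txt"
//   csv_escape(dst, "say \"hi\"")  -> "say ""hi"""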
int open_or_exit(const char *path) {
int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
if (fd < 0) {
LOG_FATALF("stats.c", "Error while creating file: %s [%d]\n", strerror(errno), errno)
}
return fd;
}
#define TREEMAP_CSV_HEADER "path,size"
#define MIME_AGG_CSV_HEADER "mime,size,count"
#define SIZE_AGG_CSV_HEADER "bucket,size,count"
#define DATE_AGG_CSV_HEADER "bucket,size,count"
void write_treemap_csv(double thresh, const char *out_path) {
void *key;
void *value;
long min_size = (long) (thresh * (double) TotalSize);
int fd = open_or_exit(out_path);
int ret = write(fd, TREEMAP_CSV_HEADER, sizeof(TREEMAP_CSV_HEADER) - 1);
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
GHashTableIter iter;
g_hash_table_iter_init(&iter, FlatTree);
while (g_hash_table_iter_next(&iter, &key, &value)) {
long size = (long) value;
if (size >= min_size) {
char path_buf[PATH_MAX * 4];
char buf[PATH_MAX * 4 + 16];
csv_escape(path_buf, key);
size_t written = sprintf(buf, "\n%s,%ld", path_buf, (long) value);
ret = write(fd, buf, written);
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
}
}
close(fd);
}
void write_agg_csv_str(const char *out_path, const char *header, GHashTable *table) {
void *key;
void *value;
char buf[4096];
int fd = open_or_exit(out_path);
int ret = write(fd, header, strlen(header));
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
GHashTableIter iter;
g_hash_table_iter_init(&iter, table);
while (g_hash_table_iter_next(&iter, &key, &value)) {
agg_t *agg = value;
int written = snprintf(buf, sizeof(buf), "\n%s,%ld,%ld", (const char *) key, agg->size, agg->count);
ret = write(fd, buf, written);
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
}
close(fd);
}
void write_agg_csv_long(const char *out_path, const char *header, GHashTable *table) {
void *key;
void *value;
char buf[4096];
int fd = open_or_exit(out_path);
int ret = write(fd, header, strlen(header));
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
GHashTableIter iter;
g_hash_table_iter_init(&iter, table);
while (g_hash_table_iter_next(&iter, &key, &value)) {
agg_t *agg = value;
int written = snprintf(buf, sizeof(buf), "\n%ld,%ld,%ld", (long) key, agg->size, agg->count);
ret = write(fd, buf, written);
if (ret == -1) {
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
}
}
close(fd);
}
int generate_stats(index_t *index, const double threshold, const char *out_prefix) {
FlatTree = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
BufferTable = g_hash_table_new(g_str_hash, g_str_equal);
AggMime = g_hash_table_new_full(g_str_hash, g_str_equal, free, free);
AggSize = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free);
AggDate = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free);
LOG_INFO("stats.c", "Generating stats...")
read_index_into_tables(index);
LOG_DEBUG("stats.c", "Read index into tables")
LOG_DEBUGF("stats.c", "Total size is %ld", TotalSize)
LOG_DEBUGF("stats.c", "Document count is %ld", DocumentCount)
LOG_DEBUGF("stats.c", "Merging small directories upwards with a threshold of %f%%", threshold * 100)
while (merge_up(threshold) > 100) {}
char tmp[PATH_MAX];
snprintf(tmp, sizeof(tmp), "%streemap.csv", out_prefix);
write_treemap_csv(threshold, tmp);
snprintf(tmp, sizeof(tmp), "%smime_agg.csv", out_prefix);
write_agg_csv_str(tmp, MIME_AGG_CSV_HEADER, AggMime);
snprintf(tmp, sizeof(tmp), "%ssize_agg.csv", out_prefix);
write_agg_csv_long(tmp, SIZE_AGG_CSV_HEADER, AggSize);
snprintf(tmp, sizeof(tmp), "%sdate_agg.csv", out_prefix);
write_agg_csv_long(tmp, DATE_AGG_CSV_HEADER, AggDate);
g_hash_table_remove_all(FlatTree);
g_hash_table_destroy(FlatTree);
g_hash_table_destroy(BufferTable);
g_hash_table_remove_all(AggMime);
g_hash_table_destroy(AggMime);
g_hash_table_remove_all(AggSize);
g_hash_table_destroy(AggSize);
g_hash_table_remove_all(AggDate);
g_hash_table_destroy(AggDate);
return 0;
}
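/*
* Illustrative call (the output prefix is an assumption for the example):
*
* generate_stats(&index, 0.001, "/tmp/my_index_");
* // writes /tmp/my_index_treemap.csv, /tmp/my_index_mime_agg.csv,
* // /tmp/my_index_size_agg.csv and /tmp/my_index_date_agg.csv
*/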

View File

@ -1,6 +0,0 @@
#ifndef SIST2_STATS_H
#define SIST2_STATS_H
int generate_stats(index_t *index, double threshold, const char* out_prefix);
#endif

View File

@ -4,257 +4,250 @@
#include <pthread.h> #include <pthread.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/wait.h> #include <sys/wait.h>
#include "mempool/mempool.h" #include "parsing/parse.h"
#define BLANK_STR " " #define BLANK_STR " "
// TODO: Use slab OOM to control queue size
#define MAX_QUEUE_SIZE 100000
typedef struct tpool_work { typedef struct {
tpool_work_arg_shm_t *arg; int thread_id;
thread_func_t func; tpool_t *pool;
struct tpool_work *next; } start_thread_arg_t;
} tpool_work_t;
typedef struct tpool { typedef struct tpool {
tpool_work_t *work_head;
tpool_work_t *work_tail;
pthread_mutex_t work_mutex;
pthread_mutex_t mem_mutex;
// TODO: Initialize with SHARED attr
pthread_cond_t has_work_cond;
pthread_cond_t working_cond;
pthread_t threads[256]; pthread_t threads[256];
int num_threads;
int thread_cnt; int fork;
int work_cnt;
int done_cnt;
int busy_cnt;
int stop;
int waiting;
int print_progress; int print_progress;
void (*cleanup_func)(); struct {
job_type_t job_type;
void *shared_memory; int stop;
size_t shared_memory_size; int waiting;
ncx_slab_pool_t *mempool; database_ipc_ctx_t ipc_ctx;
pthread_mutex_t mutex;
pthread_mutex_t data_mutex;
pthread_cond_t done_working_cond;
pthread_cond_t workers_initialized_cond;
int busy_count;
int initialized_count;
} *shm;
} tpool_t; } tpool_t;
void job_destroy(job_t *job) {
/** if (job->type == JOB_PARSE_JOB) {
* Create a work object free(job->parse_job);
*/
static tpool_work_t *tpool_work_create(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg) {
if (func == NULL) {
return NULL;
} }
// Copy heap arg to shm arg free(job);
pthread_mutex_lock(&pool->mem_mutex);
tpool_work_arg_shm_t *shm_arg = ncx_slab_alloc(pool->mempool, sizeof(tpool_work_arg_shm_t) + arg->arg_size);
shm_arg->arg_size = arg->arg_size;
memcpy(shm_arg->arg, arg->arg, arg->arg_size);
free(arg->arg);
tpool_work_t *work = ncx_slab_alloc(pool->mempool, sizeof(tpool_work_t));
pthread_mutex_unlock(&pool->mem_mutex);
work->func = func;
work->arg = shm_arg;
work->next = NULL;
return work;
} }
void tpool_dump_debug_info(tpool_t *pool) { void tpool_dump_debug_info(tpool_t *pool) {
LOG_DEBUGF("tpool.c", "pool->thread_cnt = %d", pool->thread_cnt) // TODO
LOG_DEBUGF("tpool.c", "pool->work_cnt = %d", pool->work_cnt) LOG_DEBUGF("tpool.c", "pool->num_threads = %d", pool->num_threads);
LOG_DEBUGF("tpool.c", "pool->done_cnt = %d", pool->done_cnt)
LOG_DEBUGF("tpool.c", "pool->busy_cnt = %d", pool->busy_cnt)
LOG_DEBUGF("tpool.c", "pool->stop = %d", pool->stop)
}
/**
* Pop work object from thread pool
*/
static tpool_work_t *tpool_work_get(tpool_t *pool) {
tpool_work_t *work = pool->work_head;
if (work == NULL) {
return NULL;
}
if (work->next == NULL) {
pool->work_head = NULL;
pool->work_tail = NULL;
} else {
pool->work_head = work->next;
}
return work;
} }
/** /**
* Push work object to thread pool * Push work object to thread pool
*/ */
int tpool_add_work(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg) { int tpool_add_work(tpool_t *pool, job_t *job) {
while ((pool->work_cnt - pool->done_cnt) >= MAX_QUEUE_SIZE) { if (pool->shm->job_type == JOB_UNDEFINED) {
usleep(10000); pool->shm->job_type = job->type;
} } else if (pool->shm->job_type != job->type) {
tpool_work_t *work = tpool_work_create(pool, func, arg); LOG_FATAL("tpool.c", "FIXME: tpool cannot queue jobs with different types!");
if (work == NULL) {
return 0;
} }
pthread_mutex_lock(&(pool->work_mutex)); database_add_work(ProcData.ipc_db, job);
if (pool->work_head == NULL) {
pool->work_head = work;
pool->work_tail = pool->work_head;
} else {
pool->work_tail->next = work;
pool->work_tail = work;
}
pool->work_cnt++; return TRUE;
pthread_cond_broadcast(&(pool->has_work_cond));
pthread_mutex_unlock(&(pool->work_mutex));
return 1;
} }
static void worker_thread_loop(tpool_t *pool) { static void worker_thread_loop(tpool_t *pool) {
while (TRUE) { while (TRUE) {
pthread_mutex_lock(&pool->work_mutex); if (pool->shm->stop) {
if (pool->stop) {
break; break;
} }
if (pool->work_head == NULL) { if (pool->shm->job_type == JOB_UNDEFINED) {
pthread_cond_wait(&(pool->has_work_cond), &(pool->work_mutex)); // Wait before first job is queued
pthread_mutex_lock(&pool->shm->mutex);
pthread_cond_timedwait_ms(&pool->shm->ipc_ctx.has_work_cond, &pool->shm->mutex, 1000);
pthread_mutex_unlock(&pool->shm->mutex);
} }
tpool_work_t *work = tpool_work_get(pool); job_t *job = database_get_work(ProcData.ipc_db, pool->shm->job_type);
if (work != NULL) { if (job != NULL) {
pool->busy_cnt += 1; pthread_mutex_lock(&(pool->shm->data_mutex));
} pool->shm->busy_count += 1;
pthread_mutex_unlock(&(pool->shm->data_mutex));
pthread_mutex_unlock(&(pool->work_mutex)); if (pool->shm->stop) {
if (work != NULL) {
if (pool->stop) {
break; break;
} }
work->func(work->arg); if (job->type == JOB_PARSE_JOB) {
parse(job->parse_job);
} else if (job->type == JOB_BULK_LINE) {
elastic_index_line(job->bulk_line);
}
pthread_mutex_lock(&pool->mem_mutex); job_destroy(job);
ncx_slab_free(pool->mempool, work->arg);
ncx_slab_free(pool->mempool, work);
pthread_mutex_unlock(&pool->mem_mutex);
}
pthread_mutex_lock(&(pool->work_mutex)); pthread_mutex_lock(&(pool->shm->data_mutex));
if (work != NULL) { pool->shm->busy_count -= 1;
pool->busy_cnt -= 1; pthread_mutex_unlock(&(pool->shm->data_mutex));
pool->done_cnt++;
pthread_mutex_lock(&(pool->shm->ipc_ctx.mutex));
pool->shm->ipc_ctx.completed_job_count += 1;
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
} }
if (pool->print_progress) { if (pool->print_progress) {
int done = pool->shm->ipc_ctx.completed_job_count;
int count = pool->shm->ipc_ctx.completed_job_count + pool->shm->ipc_ctx.job_count;
if (LogCtx.json_logs) { if (LogCtx.json_logs) {
progress_bar_print_json(pool->done_cnt, pool->work_cnt, ScanCtx.stat_tn_size, progress_bar_print_json(done,
ScanCtx.stat_index_size, pool->waiting); count,
ScanCtx.stat_tn_size,
ScanCtx.stat_index_size, pool->shm->waiting);
} else { } else {
progress_bar_print((double) pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size, progress_bar_print((double) done / count,
ScanCtx.stat_index_size); ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
} }
} }
if (pool->work_head == NULL) { if (job == NULL) {
pthread_cond_signal(&(pool->working_cond)); pthread_mutex_lock(&pool->shm->mutex);
pthread_cond_signal(&pool->shm->done_working_cond);
pthread_mutex_unlock(&pool->shm->mutex);
} }
pthread_mutex_unlock(&(pool->work_mutex));
} }
} }
static void worker_proc_init(tpool_t *pool, int thread_id) {
// TODO create PID -> thread_id mapping for signal handler
ProcData.thread_id = thread_id;
if (ScanCtx.index.path[0] != '\0') {
// TODO This should be closed in proc cleanup function
ProcData.index_db = database_create(ScanCtx.index.path, INDEX_DATABASE);
ProcData.index_db->ipc_ctx = &pool->shm->ipc_ctx;
database_open(ProcData.index_db);
}
// TODO /dev/shm
pthread_mutex_lock(&pool->shm->mutex);
ProcData.ipc_db = database_create("/dev/shm/ipc.sist2", IPC_CONSUMER_DATABASE);
ProcData.ipc_db->ipc_ctx = &pool->shm->ipc_ctx;
database_open(ProcData.ipc_db);
pthread_mutex_unlock(&pool->shm->mutex);
}
void worker_proc_cleanup(tpool_t* pool) {
if (ProcData.index_db != NULL) {
database_close(ProcData.index_db, FALSE);
}
database_close(ProcData.ipc_db, FALSE);
}
/** /**
* Thread worker function * Thread worker function
*/ */
static void *tpool_worker(void *arg) { static void *tpool_worker(void *arg) {
tpool_t *pool = arg; tpool_t *pool = ((start_thread_arg_t *) arg)->pool;
int pid = fork(); if (pool->fork) {
while (TRUE) {
int pid = fork();
if (pid == 0) { if (pid == 0) {
worker_proc_init(pool, ((start_thread_arg_t *) arg)->thread_id);
pthread_mutex_lock(&pool->shm->mutex);
pthread_cond_signal(&pool->shm->workers_initialized_cond);
pool->shm->initialized_count += 1;
pthread_mutex_unlock(&pool->shm->mutex);
worker_thread_loop(pool);
pthread_mutex_lock(&pool->shm->mutex);
pthread_cond_signal(&pool->shm->done_working_cond);
pthread_mutex_unlock(&pool->shm->mutex);
worker_proc_cleanup(pool);
exit(0);
} else {
int status;
// TODO: On crash, print debug info and resume thread
waitpid(pid, &status, 0);
LOG_DEBUGF("tpool.c", "Child process terminated with status code %d", WEXITSTATUS(status));
pthread_mutex_lock(&(pool->shm->ipc_ctx.mutex));
pool->shm->ipc_ctx.completed_job_count += 1;
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
pthread_mutex_lock(&(pool->shm->data_mutex));
pool->shm->busy_count -= 1;
pthread_mutex_unlock(&(pool->shm->data_mutex));
if (WIFSIGNALED(status)) {
// TODO: Get current_job based on PID
const char *job_filepath = "TODO";
LOG_FATALF_NO_EXIT(
"tpool.c",
"Child process was terminated by signal (%s).\n"
BLANK_STR "The process was working on %s",
strsignal(WTERMSIG(status)),
job_filepath
);
}
break;
}
}
} else {
worker_proc_init(pool, ((start_thread_arg_t *) arg)->thread_id);
pthread_mutex_lock(&pool->shm->mutex);
pthread_cond_signal(&pool->shm->workers_initialized_cond);
pool->shm->initialized_count += 1;
pthread_mutex_unlock(&pool->shm->mutex);
worker_thread_loop(pool); worker_thread_loop(pool);
if (pool->cleanup_func != NULL) { pthread_mutex_lock(&pool->shm->mutex);
LOG_INFO("tpool.c", "Executing cleanup function") pthread_cond_signal(&pool->shm->done_working_cond);
pool->cleanup_func(); pthread_mutex_unlock(&pool->shm->mutex);
LOG_DEBUG("tpool.c", "Done executing cleanup function")
}
pthread_cond_signal(&(pool->working_cond)); return NULL;
pthread_mutex_unlock(&(pool->work_mutex));
exit(0);
} else {
int status;
// TODO: On crash, print debug info and resume thread
waitpid(pid, &status, 0);
LOG_DEBUGF("tpool.c", "Child process terminated with status code %d", WEXITSTATUS(status))
pthread_mutex_lock(&(pool->work_mutex));
pool->busy_cnt -= 1;
pool->done_cnt++;
pthread_mutex_unlock(&(pool->work_mutex));
if (WIFSIGNALED(status)) {
// parse_job_t *job = g_hash_table_lookup(ScanCtx.dbg_current_files, GINT_TO_POINTER(pthread_self()));
const char *job_filepath = "TODO";
LOG_FATALF_NO_EXIT(
"tpool.c",
"Child process was terminated by signal (%s).\n"
BLANK_STR "The process was working on %s",
strsignal(WTERMSIG(status)),
job_filepath
)
}
} }
return NULL; return NULL;
} }
void tpool_wait(tpool_t *pool) { void tpool_wait(tpool_t *pool) {
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish") LOG_DEBUG("tpool.c", "Waiting for worker threads to finish");
pthread_mutex_lock(&(pool->work_mutex)); pthread_mutex_lock(&pool->shm->mutex);
pool->waiting = TRUE; pool->shm->waiting = TRUE;
pool->shm->ipc_ctx.no_more_jobs = TRUE;
while (TRUE) { while (TRUE) {
if (pool->done_cnt < pool->work_cnt) { if (pool->shm->ipc_ctx.job_count > 0) {
pthread_cond_wait(&(pool->working_cond), &(pool->work_mutex)); pthread_cond_wait(&(pool->shm->done_working_cond), &pool->shm->mutex);
} else { } else {
LOG_INFOF("tpool.c", "Received head=NULL signal, busy_cnt=%d", pool->busy_cnt); if (pool->shm->ipc_ctx.job_count == 0 && pool->shm->busy_count == 0) {
pool->shm->stop = TRUE;
if (pool->done_cnt == pool->work_cnt && pool->busy_cnt == 0) {
pool->stop = TRUE;
break; break;
} }
} }
@ -262,34 +255,25 @@ void tpool_wait(tpool_t *pool) {
if (pool->print_progress && !LogCtx.json_logs) { if (pool->print_progress && !LogCtx.json_logs) {
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size); progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
} }
pthread_mutex_unlock(&(pool->work_mutex)); pthread_mutex_unlock(&pool->shm->mutex);
LOG_INFO("tpool.c", "Worker threads finished") LOG_INFO("tpool.c", "Worker threads finished");
} }
void tpool_destroy(tpool_t *pool) { void tpool_destroy(tpool_t *pool) {
if (pool == NULL) { LOG_INFO("tpool.c", "Destroying thread pool");
return;
}
LOG_INFO("tpool.c", "Destroying thread pool") database_close(ProcData.ipc_db, FALSE);
pthread_mutex_lock(&(pool->work_mutex));
tpool_work_t *work = pool->work_head;
int count = 0; int count = 0;
while (work != NULL) {
tpool_work_t *tmp = work->next;
free(work);
work = tmp;
count += 1;
}
LOG_DEBUGF("tpool.c", "Destroyed %d jobs", count); LOG_DEBUGF("tpool.c", "Destroyed %d jobs", count);
pthread_cond_broadcast(&(pool->has_work_cond)); pthread_mutex_lock(&pool->shm->mutex);
pthread_mutex_unlock(&(pool->work_mutex)); pthread_cond_broadcast(&pool->shm->ipc_ctx.has_work_cond);
pthread_mutex_unlock(&pool->shm->mutex);
for (size_t i = 0; i < pool->thread_cnt; i++) { for (size_t i = 0; i < pool->num_threads; i++) {
pthread_t thread = pool->threads[i]; pthread_t thread = pool->threads[i];
if (thread != 0) { if (thread != 0) {
void *_; void *_;
@ -297,42 +281,33 @@ void tpool_destroy(tpool_t *pool) {
} }
} }
LOG_INFO("tpool.c", "Final cleanup") pthread_mutex_destroy(&pool->shm->ipc_ctx.mutex);
pthread_mutex_destroy(&pool->shm->mutex);
pthread_cond_destroy(&pool->shm->ipc_ctx.has_work_cond);
pthread_cond_destroy(&pool->shm->done_working_cond);
pthread_mutex_destroy(&(pool->work_mutex)); munmap(pool->shm, sizeof(*pool->shm));
pthread_cond_destroy(&(pool->has_work_cond));
pthread_cond_destroy(&(pool->working_cond));
munmap(pool->shared_memory, pool->shared_memory_size);
} }
/** /**
* Create a thread pool * Create a thread pool
* @param thread_cnt Worker threads count * @param thread_cnt Worker threads count
*/ */
tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int print_progress) { tpool_t *tpool_create(int thread_cnt, int print_progress) {
size_t shm_size = 1024 * 1024 * 2000; int fork = FALSE;
void *shared_memory = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); tpool_t *pool = malloc(sizeof(tpool_t));
tpool_t *pool = (tpool_t *) shared_memory; pool->shm = mmap(NULL, sizeof(*pool->shm), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
pool->shared_memory = shared_memory;
pool->shared_memory_size = shm_size;
pool->mempool = (ncx_slab_pool_t *) (pool->shared_memory + sizeof(tpool_t));
pool->mempool->addr = pool->mempool;
pool->mempool->min_shift = 4;
pool->mempool->end = pool->shared_memory + shm_size;
ncx_slab_init(pool->mempool); pool->fork = fork;
pool->num_threads = thread_cnt;
pool->thread_cnt = thread_cnt; pool->shm->ipc_ctx.job_count = 0;
pool->work_cnt = 0; pool->shm->ipc_ctx.no_more_jobs = FALSE;
pool->done_cnt = 0; pool->shm->stop = FALSE;
pool->busy_cnt = 0; pool->shm->waiting = FALSE;
pool->stop = FALSE; pool->shm->job_type = JOB_UNDEFINED;
pool->waiting = FALSE;
pool->cleanup_func = cleanup_func;
memset(pool->threads, 0, sizeof(pool->threads)); memset(pool->threads, 0, sizeof(pool->threads));
pool->print_progress = print_progress; pool->print_progress = print_progress;
@ -340,27 +315,50 @@ tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int print_progress) {
pthread_mutexattr_init(&mutexattr); pthread_mutexattr_init(&mutexattr);
pthread_mutexattr_setpshared(&mutexattr, TRUE); pthread_mutexattr_setpshared(&mutexattr, TRUE);
pthread_mutex_init(&(pool->work_mutex), &mutexattr); pthread_mutex_init(&(pool->shm->mutex), &mutexattr);
pthread_mutex_init(&(pool->mem_mutex), &mutexattr); pthread_mutex_init(&(pool->shm->data_mutex), &mutexattr);
pthread_mutex_init(&(pool->shm->ipc_ctx.mutex), &mutexattr);
pthread_mutex_init(&(pool->shm->ipc_ctx.db_mutex), &mutexattr);
pthread_mutex_init(&(pool->shm->ipc_ctx.index_db_mutex), &mutexattr);
pthread_condattr_t condattr; pthread_condattr_t condattr;
pthread_condattr_init(&condattr); pthread_condattr_init(&condattr);
pthread_condattr_setpshared(&condattr, TRUE); pthread_condattr_setpshared(&condattr, TRUE);
pthread_cond_init(&(pool->has_work_cond), &condattr); pthread_cond_init(&(pool->shm->ipc_ctx.has_work_cond), &condattr);
pthread_cond_init(&(pool->working_cond), &condattr); pthread_cond_init(&(pool->shm->done_working_cond), &condattr);
pthread_cond_init(&(pool->shm->workers_initialized_cond), &condattr);
pool->work_head = NULL; remove("/dev/shm/ipc.sist2");
pool->work_tail = NULL; remove("/dev/shm/ipc.sist2-wal");
remove("/dev/shm/ipc.sist2-shm");
ProcData.ipc_db = database_create("/dev/shm/ipc.sist2", IPC_PRODUCER_DATABASE);
ProcData.ipc_db->ipc_ctx = &pool->shm->ipc_ctx;
database_initialize(ProcData.ipc_db);
return pool; return pool;
} }
void tpool_start(tpool_t *pool) { void tpool_start(tpool_t *pool) {
LOG_INFOF("tpool.c", "Starting thread pool with %d threads", pool->thread_cnt) LOG_INFOF("tpool.c", "Starting thread pool with %d threads", pool->num_threads);
for (size_t i = 0; i < pool->thread_cnt; i++) { pthread_mutex_lock(&pool->shm->mutex);
pthread_create(&pool->threads[i], NULL, tpool_worker, pool);
for (int i = 0; i < pool->num_threads; i++) {
start_thread_arg_t *arg = malloc(sizeof(start_thread_arg_t));
arg->thread_id = i + 1;
arg->pool = pool;
pthread_create(&pool->threads[i], NULL, tpool_worker, arg);
} }
// Only open the database when all workers are done initializing
while (pool->shm->initialized_count != pool->num_threads) {
pthread_cond_wait(&pool->shm->workers_initialized_cond, &pool->shm->mutex);
}
pthread_mutex_unlock(&pool->shm->mutex);
database_open(ProcData.ipc_db);
} }

View File

@ -2,34 +2,27 @@
#define SIST2_TPOOL_H #define SIST2_TPOOL_H
#include "sist.h" #include "sist.h"
#include "third-party/libscan/libscan/scan.h"
#include "index/elastic.h"
#include "src/database/database.h"
struct tpool; struct tpool;
typedef struct tpool tpool_t; typedef struct tpool tpool_t;
typedef struct { tpool_t *tpool_create(int num, int print_progress);
size_t arg_size;
void *arg;
} tpool_work_arg_t;
typedef struct {
size_t arg_size;
char arg[0];
} tpool_work_arg_shm_t;
typedef void (*thread_func_t)(tpool_work_arg_shm_t *arg);
tpool_t *tpool_create(int num, void (*cleanup_func)(), int print_progress);
void tpool_start(tpool_t *pool); void tpool_start(tpool_t *pool);
void tpool_destroy(tpool_t *pool); void tpool_destroy(tpool_t *pool);
int tpool_add_work(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg); int tpool_add_work(tpool_t *pool, job_t *job);
void tpool_wait(tpool_t *pool); void tpool_wait(tpool_t *pool);
void tpool_dump_debug_info(tpool_t *pool); void tpool_dump_debug_info(tpool_t *pool);
void job_destroy(job_t *job);
#endif #endif

View File

@ -1,24 +1,26 @@
#ifndef SIST2_TYPES_H #ifndef SIST2_TYPES_H
#define SIST2_TYPES_H #define SIST2_TYPES_H
#define INDEX_TYPE_NDJSON "ndjson" typedef struct database database_t;
typedef struct index_descriptor { typedef struct index_descriptor {
char id[SIST_INDEX_ID_LEN]; char id[SIST_INDEX_ID_LEN];
char version[64]; char version[64];
int version_major;
int version_minor;
int version_patch;
long timestamp; long timestamp;
char root[PATH_MAX]; char root[PATH_MAX];
char rewrite_url[8192]; char rewrite_url[8192];
short root_len; int root_len;
char name[1024]; char name[1024];
char type[64];
} index_descriptor_t; } index_descriptor_t;
typedef struct index_t { typedef struct index_t {
struct index_descriptor desc; struct index_descriptor desc;
struct store_t *store;
struct store_t *tag_store; database_t *db;
struct store_t *meta_store;
char path[PATH_MAX]; char path[PATH_MAX];
} index_t; } index_t;

View File

@ -25,7 +25,6 @@ dyn_buffer_t url_escape(char *str) {
} }
char *abspath(const char *path) { char *abspath(const char *path) {
char *expanded = expandpath(path); char *expanded = expandpath(path);
char *abs = realpath(expanded, NULL); char *abs = realpath(expanded, NULL);
@ -34,8 +33,7 @@ char *abspath(const char *path) {
return NULL; return NULL;
} }
if (strlen(abs) > 1) { if (strlen(abs) > 1) {
abs = realloc(abs, strlen(abs) + 2); abs = realloc(abs, strlen(abs) + 1);
strcat(abs, "/");
} }
return abs; return abs;
@ -76,9 +74,8 @@ char *expandpath(const char *path) {
} }
} }
char *expanded = malloc(strlen(tmp) + 2); char *expanded = malloc(strlen(tmp) + 1);
strcpy(expanded, tmp); strcpy(expanded, tmp);
strcat(expanded, "/");
wordfree(&w); wordfree(&w);
return expanded; return expanded;
@ -103,6 +100,10 @@ void progress_bar_print_json(size_t done, size_t count, size_t tn_size, size_t i
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) { void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
if (isnan(percentage)) {
return;
}
// TODO: Fix this with shm/ctx // TODO: Fix this with shm/ctx
static int last_val = -1; static int last_val = -1;
@ -150,10 +151,6 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
PrintingProgressBar = TRUE; PrintingProgressBar = TRUE;
} }
GHashTable *incremental_get_table() {
GHashTable *file_table = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
return file_table;
}
const char *find_file_in_paths(const char *paths[], const char *filename) { const char *find_file_in_paths(const char *paths[], const char *filename) {
@ -167,7 +164,7 @@ const char *find_file_in_paths(const char *paths[], const char *filename) {
char path[PATH_MAX]; char path[PATH_MAX];
snprintf(path, sizeof(path), "%s%s", apath, filename); snprintf(path, sizeof(path), "%s%s", apath, filename);
LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, apath) LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, apath);
free(apath); free(apath);
struct stat info; struct stat info;
@ -269,3 +266,39 @@ void str_unescape(char *dst, const char *str) {
} }
*cur = '\0'; *cur = '\0';
} }
#define NSEC_PER_SEC 1000000000
struct timespec timespec_normalise(struct timespec ts) {
while (ts.tv_nsec >= NSEC_PER_SEC) {
ts.tv_sec += 1;
ts.tv_nsec -= NSEC_PER_SEC;
}
while (ts.tv_nsec <= -NSEC_PER_SEC) {
ts.tv_sec -= 1;
ts.tv_nsec += NSEC_PER_SEC;
}
if (ts.tv_nsec < 0) {
ts.tv_sec -= 1;
ts.tv_nsec = (NSEC_PER_SEC + ts.tv_nsec);
}
return ts;
}
struct timespec timespec_add(struct timespec ts1, long usec) {
ts1 = timespec_normalise(ts1);
struct timespec ts2 = timespec_normalise((struct timespec) {
.tv_sec = 0,
.tv_nsec = usec * 1000
});
ts1.tv_sec += ts2.tv_sec;
ts1.tv_nsec += ts2.tv_nsec;
return timespec_normalise(ts1);
}
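/*
* Illustrative usage (not part of the commit): build an absolute deadline for
* pthread_cond_timedwait(). timespec_normalise() keeps tv_nsec in [0, NSEC_PER_SEC),
* which the pthread functions require.
*
* struct timespec now;
* clock_gettime(CLOCK_REALTIME, &now);
* struct timespec deadline = timespec_add(now, 250 * 1000); // now + 250 ms
*/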

View File

@ -5,8 +5,6 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <glib.h>
#include "third-party/utf8.h/utf8.h" #include "third-party/utf8.h/utf8.h"
#include "libscan/scan.h" #include "libscan/scan.h"
@ -22,9 +20,6 @@ extern int PrintingProgressBar;
void progress_bar_print_json(size_t done, size_t count, size_t tn_size, size_t index_size, int waiting); void progress_bar_print_json(size_t done, size_t count, size_t tn_size, size_t index_size, int waiting);
void progress_bar_print(double percentage, size_t tn_size, size_t index_size); void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();
const char *find_file_in_paths(const char **paths, const char *filename); const char *find_file_in_paths(const char **paths, const char *filename);
@ -100,31 +95,23 @@ static void generate_doc_id(const char *rel_path, char *doc_id) {
buf2hex(md, sizeof(md), doc_id); buf2hex(md, sizeof(md), doc_id);
} }
__always_inline #define MILLISECOND 1000
static void incremental_put(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN], int mtime) {
char *ptr = malloc(SIST_DOC_ID_LEN);
strcpy(ptr, doc_id);
g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
}
__always_inline struct timespec timespec_add(struct timespec ts1, long usec);
static int incremental_get(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
if (table != NULL) {
return GPOINTER_TO_INT(g_hash_table_lookup(table, doc_id));
} else {
return 0;
}
}
/** #define TIMER_INIT() struct timespec timer_begin
* Marks a file by adding it to a table. #define TIMER_START() clock_gettime(CLOCK_REALTIME, &timer_begin)
* !!Not thread safe. #define TIMER_END(x) do { \
*/ struct timespec timer_end; \
__always_inline clock_gettime(CLOCK_REALTIME, &timer_end); \
static int incremental_mark_file(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) { x = (timer_end.tv_sec - timer_begin.tv_sec) * 1000000 + (timer_end.tv_nsec - timer_begin.tv_nsec) / 1000; \
char *ptr = malloc(SIST_DOC_ID_LEN); } while (0)
strcpy(ptr, doc_id);
return g_hash_table_insert(table, ptr, GINT_TO_POINTER(1)); #define pthread_cond_timedwait_ms(cond, mutex, delay_ms) do {\
} struct timespec now; \
clock_gettime(CLOCK_REALTIME, &now); \
struct timespec end_time = timespec_add(now, MILLISECOND * delay_ms); \
pthread_cond_timedwait(cond, mutex, &end_time); \
} while (0)
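/*
* Illustrative usage of the helpers above (do_work, cond and mutex are example
* names only):
*
* TIMER_INIT();
* TIMER_START();
* do_work();
* long elapsed_us;
* TIMER_END(elapsed_us); // microseconds between TIMER_START and TIMER_END
*
* pthread_mutex_lock(&mutex);
* pthread_cond_timedwait_ms(&cond, &mutex, 1000); // wait at most ~1000 ms
* pthread_mutex_unlock(&mutex);
*/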
#endif #endif

View File

@ -1,15 +1,14 @@
#include "serve.h" #include "serve.h"
#include "src/sist.h" #include "src/sist.h"
#include "src/io/store.h" //#include "src/io/store.h"
#include "static_generated.c"
#include "src/index/elastic.h" #include "src/index/elastic.h"
#include "src/index/web.h" #include "src/index/web.h"
#include "src/auth0/auth0_c_api.h" #include "src/auth0/auth0_c_api.h"
#include "src/web/web_util.h"
#include <src/ctx.h> #include <src/ctx.h>
#define HTTP_SERVER_HEADER "Server: sist2/" VERSION "\r\n"
#define HTTP_TEXT_TYPE_HEADER "Content-Type: text/plain;charset=utf-8\r\n" #define HTTP_TEXT_TYPE_HEADER "Content-Type: text/plain;charset=utf-8\r\n"
#define HTTP_REPLY_NOT_FOUND mg_http_reply(nc, 404, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Not found"); #define HTTP_REPLY_NOT_FOUND mg_http_reply(nc, 404, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Not found");
@ -20,62 +19,6 @@ static struct mg_http_serve_opts DefaultServeOpts = {
.mime_types = "" .mime_types = ""
}; };
__always_inline
static char *address_to_string(struct mg_addr *addr) {
static char address_to_string_buf[INET6_ADDRSTRLEN];
return mg_ntoa(addr, address_to_string_buf, sizeof(address_to_string_buf));
}
static void send_response_line(struct mg_connection *nc, int status_code, size_t length, char *extra_headers) {
mg_printf(
nc,
"HTTP/1.1 %d %s\r\n"
HTTP_SERVER_HEADER
"Content-Length: %d\r\n"
"%s\r\n\r\n",
status_code, "OK",
length,
extra_headers
);
}
index_t *get_index_by_id(const char *index_id) {
for (int i = WebCtx.index_count; i >= 0; i--) {
if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
return &WebCtx.indices[i];
}
}
return NULL;
}
store_t *get_store(const char *index_id) {
index_t *idx = get_index_by_id(index_id);
if (idx != NULL) {
return idx->store;
}
return NULL;
}
store_t *get_tag_store(const char *index_id) {
index_t *idx = get_index_by_id(index_id);
if (idx != NULL) {
return idx->tag_store;
}
return NULL;
}
void search_index(struct mg_connection *nc, struct mg_http_message *hm) {
if (WebCtx.dev) {
mg_http_serve_file(nc, hm, "sist2-vue/dist/index.html", &DefaultServeOpts);
} else {
send_response_line(nc, 200, sizeof(index_html), "Content-Type: text/html");
mg_send(nc, index_html, sizeof(index_html));
}
}
void stats_files(struct mg_connection *nc, struct mg_http_message *hm) { void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->uri.len != SIST_INDEX_ID_LEN + 4) { if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
@ -87,7 +30,7 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN); memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
*(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0'; *(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';
index_t *index = get_index_by_id(arg_index_id); index_t *index = web_get_index_by_id(arg_index_id);
if (index == NULL) { if (index == NULL) {
HTTP_REPLY_NOT_FOUND HTTP_REPLY_NOT_FOUND
return; return;
@ -123,87 +66,58 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
mg_http_serve_file(nc, hm, full_path, &opts); mg_http_serve_file(nc, hm, full_path, &opts);
} }
void javascript(struct mg_connection *nc, struct mg_http_message *hm) { void serve_index_html(struct mg_connection *nc, struct mg_http_message *hm) {
if (WebCtx.dev) {
mg_http_serve_file(nc, hm, "sist2-vue/dist/index.html", &DefaultServeOpts);
} else {
web_serve_asset_index_html(nc);
}
}
void serve_index_js(struct mg_connection *nc, struct mg_http_message *hm) {
if (WebCtx.dev) { if (WebCtx.dev) {
mg_http_serve_file(nc, hm, "sist2-vue/dist/js/index.js", &DefaultServeOpts); mg_http_serve_file(nc, hm, "sist2-vue/dist/js/index.js", &DefaultServeOpts);
} else { } else {
send_response_line(nc, 200, sizeof(index_js), "Content-Type: application/javascript"); web_serve_asset_index_js(nc);
mg_send(nc, index_js, sizeof(index_js));
} }
} }
void javascript_vendor(struct mg_connection *nc, struct mg_http_message *hm) { void serve_chunk_vendors_js(struct mg_connection *nc, struct mg_http_message *hm) {
if (WebCtx.dev) { if (WebCtx.dev) {
mg_http_serve_file(nc, hm, "sist2-vue/dist/js/chunk-vendors.js", &DefaultServeOpts); mg_http_serve_file(nc, hm, "sist2-vue/dist/js/chunk-vendors.js", &DefaultServeOpts);
} else { } else {
send_response_line(nc, 200, sizeof(chunk_vendors_js), "Content-Type: application/javascript"); web_serve_asset_chunk_vendors_js(nc);
mg_send(nc, chunk_vendors_js, sizeof(chunk_vendors_js));
} }
} }
void favicon(struct mg_connection *nc, struct mg_http_message *hm) { void serve_favicon_ico(struct mg_connection *nc, struct mg_http_message *hm) {
send_response_line(nc, 200, sizeof(favicon_ico), "Content-Type: image/x-icon"); web_serve_asset_favicon_ico(nc);
mg_send(nc, favicon_ico, sizeof(favicon_ico));
} }
void style(struct mg_connection *nc, struct mg_http_message *hm) { void serve_style_css(struct mg_connection *nc, struct mg_http_message *hm) {
send_response_line(nc, 200, sizeof(index_css), "Content-Type: text/css"); web_serve_asset_style_css(nc);
mg_send(nc, index_css, sizeof(index_css));
} }
void style_vendor(struct mg_connection *nc, struct mg_http_message *hm) { void serve_chunk_vendors_css(struct mg_connection *nc, struct mg_http_message *hm) {
send_response_line(nc, 200, sizeof(chunk_vendors_css), "Content-Type: text/css"); web_serve_asset_chunk_vendors_css(nc);
mg_send(nc, chunk_vendors_css, sizeof(chunk_vendors_css));
} }
void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) { void serve_thumbnail(struct mg_connection *nc, struct mg_http_message *hm, const char *arg_index,
const char *arg_doc_id, int arg_num) {
int has_thumbnail_index = FALSE; database_t *db = web_get_database(arg_index);
if (db == NULL) {
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) { LOG_DEBUGF("serve.c", "Could not get database for index: %s", arg_index);
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 4) {
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr)
HTTP_REPLY_NOT_FOUND
return;
}
has_thumbnail_index = TRUE;
}
char arg_doc_id[SIST_DOC_ID_LEN];
char arg_index[SIST_INDEX_ID_LEN];
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
store_t *store = get_store(arg_index);
if (store == NULL) {
LOG_DEBUGF("serve.c", "Could not get store for index: %s", arg_index)
HTTP_REPLY_NOT_FOUND HTTP_REPLY_NOT_FOUND
return; return;
} }
char *data;
size_t data_len = 0; size_t data_len = 0;
if (has_thumbnail_index) { void *data = database_read_thumbnail(db, arg_doc_id, arg_num, &data_len);
const char *tn_index = hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2;
char tn_key[sizeof(arg_doc_id) + sizeof(char) * 4];
memcpy(tn_key, arg_doc_id, sizeof(arg_doc_id));
memcpy(tn_key + sizeof(arg_doc_id) - 1, tn_index, sizeof(char) * 4);
*(tn_key + sizeof(tn_key) - 1) = '\0';
data = store_read(store, (char *) tn_key, sizeof(tn_key), &data_len);
} else {
data = store_read(store, (char *) arg_doc_id, sizeof(arg_doc_id), &data_len);
}
if (data_len != 0) { if (data_len != 0) {
send_response_line( web_send_headers(
nc, 200, data_len, nc, 200, data_len,
"Content-Type: image/jpeg\r\n" "Content-Type: image/jpeg\r\n"
"Cache-Control: max-age=31536000" "Cache-Control: max-age=31536000"
@ -216,10 +130,50 @@ void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
} }
} }
void search(struct mg_connection *nc, struct mg_http_message *hm) { void thumbnail_with_num(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 5) {
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr);
HTTP_REPLY_NOT_FOUND
return;
}
char arg_doc_id[SIST_DOC_ID_LEN];
char arg_index[SIST_INDEX_ID_LEN];
char arg_num[5] = {0};
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
memcpy(arg_num, hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2, 4);
int num = (int) strtol(arg_num, NULL, 10);
serve_thumbnail(nc, hm, arg_index, arg_doc_id, num);
}
void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) {
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr);
HTTP_REPLY_NOT_FOUND
return;
}
char arg_doc_id[SIST_DOC_ID_LEN];
char arg_index[SIST_INDEX_ID_LEN];
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
serve_thumbnail(nc, hm, arg_index, arg_doc_id, 0);
}
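/*
* Route shapes handled above (illustrative ids):
* GET /t/<index_id>/<doc_id> -> serve_thumbnail(..., num = 0)
* GET /t/<index_id>/<doc_id>/0001 -> thumbnail_with_num(), num parsed with strtol
*/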
void search(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->body.len == 0) { if (hm->body.len == 0) {
LOG_DEBUG("serve.c", "Client sent empty body, ignoring request") LOG_DEBUG("serve.c", "Client sent empty body, ignoring request");
mg_http_reply(nc, 400, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Invalid request"); mg_http_reply(nc, 400, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Invalid request");
return; return;
} }
@ -266,7 +220,7 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
if (strcmp(MG_VERSION, EXPECTED_MONGOOSE_VERSION) != 0) { if (strcmp(MG_VERSION, EXPECTED_MONGOOSE_VERSION) != 0) {
LOG_WARNING("serve.c", "sist2 was not linked with latest mongoose version, " LOG_WARNING("serve.c", "sist2 was not linked with latest mongoose version, "
"serving file from disk might not work as expected.") "serving file from disk might not work as expected.");
} }
const char *path = cJSON_GetObjectItem(json, "path")->valuestring; const char *path = cJSON_GetObjectItem(json, "path")->valuestring;
@ -285,7 +239,7 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
idx->desc.root, path_unescaped, strlen(path_unescaped) == 0 ? "" : "/", idx->desc.root, path_unescaped, strlen(path_unescaped) == 0 ? "" : "/",
name_unescaped, strlen(ext) == 0 ? "" : ".", ext); name_unescaped, strlen(ext) == 0 ? "" : ".", ext);
LOG_DEBUGF("serve.c", "Serving file from disk: %s", full_path) LOG_DEBUGF("serve.c", "Serving file from disk: %s", full_path);
char disposition[8192]; char disposition[8192];
snprintf(disposition, sizeof(disposition), snprintf(disposition, sizeof(disposition),
@ -372,7 +326,7 @@ void index_info(struct mg_connection *nc) {
char *json_str = cJSON_PrintUnformatted(json); char *json_str = cJSON_PrintUnformatted(json);
send_response_line(nc, 200, strlen(json_str), "Content-Type: application/json"); web_send_headers(nc, 200, strlen(json_str), "Content-Type: application/json");
mg_send(nc, json_str, strlen(json_str)); mg_send(nc, json_str, strlen(json_str));
free(json_str); free(json_str);
cJSON_Delete(json); cJSON_Delete(json);
@ -382,7 +336,7 @@ void index_info(struct mg_connection *nc) {
void file(struct mg_connection *nc, struct mg_http_message *hm) { void file(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->uri.len != SIST_DOC_ID_LEN + 2) { if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr) LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr);
HTTP_REPLY_NOT_FOUND HTTP_REPLY_NOT_FOUND
return; return;
} }
@ -412,7 +366,7 @@ void file(struct mg_connection *nc, struct mg_http_message *hm) {
next = parent->valuestring; next = parent->valuestring;
} }
index_t *idx = get_index_by_id(index_id->valuestring); index_t *idx = web_get_index_by_id(index_id->valuestring);
if (idx == NULL) { if (idx == NULL) {
cJSON_Delete(doc); cJSON_Delete(doc);
@ -431,9 +385,9 @@ void file(struct mg_connection *nc, struct mg_http_message *hm) {
void status(struct mg_connection *nc) { void status(struct mg_connection *nc) {
char *status = elastic_get_status(); char *status = elastic_get_status();
if (strcmp(status, "open") == 0) { if (strcmp(status, "open") == 0) {
send_response_line(nc, 204, 0, "Content-Type: application/json"); web_send_headers(nc, 204, 0, "Content-Type: application/json");
} else { } else {
send_response_line(nc, 500, 0, "Content-Type: application/json"); web_send_headers(nc, 500, 0, "Content-Type: application/json");
} }
free(status); free(status);
@ -475,114 +429,114 @@ tag_req_t *parse_tag_request(cJSON *json) {
} }
void tag(struct mg_connection *nc, struct mg_http_message *hm) { void tag(struct mg_connection *nc, struct mg_http_message *hm) {
if (hm->uri.len != SIST_INDEX_ID_LEN + 4) { // if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr) // LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr)
HTTP_REPLY_NOT_FOUND // HTTP_REPLY_NOT_FOUND
return; // return;
} // }
//
char arg_index[SIST_INDEX_ID_LEN]; // char arg_index[SIST_INDEX_ID_LEN];
memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN); // memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN);
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0'; // *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
//
if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) { // if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
LOG_DEBUG("serve.c", "Invalid tag request") // LOG_DEBUG("serve.c", "Invalid tag request")
HTTP_REPLY_NOT_FOUND // HTTP_REPLY_NOT_FOUND
return; // return;
} // }
//
store_t *store = get_tag_store(arg_index); // store_t *store = get_tag_store(arg_index);
if (store == NULL) { // if (store == NULL) {
LOG_DEBUGF("serve.c", "Could not get tag store for index: %s", arg_index) // LOG_DEBUGF("serve.c", "Could not get tag store for index: %s", arg_index)
HTTP_REPLY_NOT_FOUND // HTTP_REPLY_NOT_FOUND
return; // return;
} // }
//
char *body = malloc(hm->body.len + 1); // char *body = malloc(hm->body.len + 1);
memcpy(body, hm->body.ptr, hm->body.len); // memcpy(body, hm->body.ptr, hm->body.len);
*(body + hm->body.len) = '\0'; // *(body + hm->body.len) = '\0';
cJSON *json = cJSON_Parse(body); // cJSON *json = cJSON_Parse(body);
//
tag_req_t *arg_req = parse_tag_request(json); // tag_req_t *arg_req = parse_tag_request(json);
if (arg_req == NULL) { // if (arg_req == NULL) {
LOG_DEBUGF("serve.c", "Could not parse tag request", arg_index) // LOG_DEBUGF("serve.c", "Could not parse tag request", arg_index)
cJSON_Delete(json); // cJSON_Delete(json);
free(body); // free(body);
mg_http_reply(nc, 400, "", "Invalid request"); // mg_http_reply(nc, 400, "", "Invalid request");
return; // return;
} // }
//
cJSON *arr = NULL; // cJSON *arr = NULL;
//
size_t data_len = 0; // size_t data_len = 0;
const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len); // const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len);
if (data_len == 0) { // if (data_len == 0) {
arr = cJSON_CreateArray(); // arr = cJSON_CreateArray();
} else { // } else {
arr = cJSON_Parse(data); // arr = cJSON_Parse(data);
} // }
//
if (arg_req->delete) { // if (arg_req->delete) {
//
if (data_len > 0) { // if (data_len > 0) {
cJSON *element = NULL; // cJSON *element = NULL;
int i = 0; // int i = 0;
cJSON_ArrayForEach(element, arr) { // cJSON_ArrayForEach(element, arr) {
if (strcmp(element->valuestring, arg_req->name) == 0) { // if (strcmp(element->valuestring, arg_req->name) == 0) {
cJSON_DeleteItemFromArray(arr, i); // cJSON_DeleteItemFromArray(arr, i);
break; // break;
} // }
i++; // i++;
} // }
} // }
//
char *buf = malloc(sizeof(char) * 8192); // char *buf = malloc(sizeof(char) * 8192);
snprintf(buf, 8192, // snprintf(buf, 8192,
"{" // "{"
" \"script\" : {" // " \"script\" : {"
" \"source\": \"if (ctx._source.tag.contains(params.tag)) { ctx._source.tag.remove(ctx._source.tag.indexOf(params.tag)) }\"," // " \"source\": \"if (ctx._source.tag.contains(params.tag)) { ctx._source.tag.remove(ctx._source.tag.indexOf(params.tag)) }\","
" \"lang\": \"painless\"," // " \"lang\": \"painless\","
" \"params\" : {" // " \"params\" : {"
" \"tag\" : \"%s\"" // " \"tag\" : \"%s\""
" }" // " }"
" }" // " }"
"}", arg_req->name // "}", arg_req->name
); // );
//
char url[4096]; // char url[4096];
snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id); // snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl); // nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
//
} else { // } else {
cJSON_AddItemToArray(arr, cJSON_CreateString(arg_req->name)); // cJSON_AddItemToArray(arr, cJSON_CreateString(arg_req->name));
//
char *buf = malloc(sizeof(char) * 8192); // char *buf = malloc(sizeof(char) * 8192);
snprintf(buf, 8192, // snprintf(buf, 8192,
"{" // "{"
" \"script\" : {" // " \"script\" : {"
" \"source\": \"if(ctx._source.tag == null) {ctx._source.tag = new ArrayList()} ctx._source.tag.add(params.tag)\"," // " \"source\": \"if(ctx._source.tag == null) {ctx._source.tag = new ArrayList()} ctx._source.tag.add(params.tag)\","
" \"lang\": \"painless\"," // " \"lang\": \"painless\","
" \"params\" : {" // " \"params\" : {"
" \"tag\" : \"%s\"" // " \"tag\" : \"%s\""
" }" // " }"
" }" // " }"
"}", arg_req->name // "}", arg_req->name
); // );
//
char url[4096]; // char url[4096];
snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id); // snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl); // nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
} // }
//
char *json_str = cJSON_PrintUnformatted(arr); // char *json_str = cJSON_PrintUnformatted(arr);
store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1); // store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1);
store_flush(store); // store_flush(store);
//
free(arg_req); // free(arg_req);
free(json_str); // free(json_str);
cJSON_Delete(json); // cJSON_Delete(json);
cJSON_Delete(arr); // cJSON_Delete(arr);
free(body); // free(body);
} }
int validate_auth(struct mg_connection *nc, struct mg_http_message *hm) { int validate_auth(struct mg_connection *nc, struct mg_http_message *hm) {
@ -601,7 +555,7 @@ int check_auth0(struct mg_http_message *hm) {
struct mg_str *cookie = mg_http_get_header(hm, "Cookie"); struct mg_str *cookie = mg_http_get_header(hm, "Cookie");
if (cookie == NULL) { if (cookie == NULL) {
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)") LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)");
return FALSE; return FALSE;
} }
@ -610,7 +564,7 @@ int check_auth0(struct mg_http_message *hm) {
token = mg_http_get_header_var(*cookie, mg_str("sist2-auth0")); token = mg_http_get_header_var(*cookie, mg_str("sist2-auth0"));
if (token.len == 0) { if (token.len == 0) {
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)") LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)");
return FALSE; return FALSE;
} }
@ -644,28 +598,31 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
} }
} }
char uri[256];
size_t uri_len = hm->uri.len < sizeof(uri) - 1 ? hm->uri.len : sizeof(uri) - 1;
memcpy(uri, hm->uri.ptr, uri_len);
*(uri + uri_len) = '\0';
LOG_DEBUGF("serve.c", "<%s> GET %s", LOG_DEBUGF("serve.c", "<%s> GET %s",
address_to_string(&(nc->rem)), web_address_to_string(&(nc->rem)),
hm->uri uri
) );
if (mg_http_match_uri(hm, "/")) { if (mg_http_match_uri(hm, "/")) {
search_index(nc, hm); serve_index_html(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/favicon.ico")) { } else if (mg_http_match_uri(hm, "/favicon.ico")) {
favicon(nc, hm); serve_favicon_ico(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/css/index.css")) { } else if (mg_http_match_uri(hm, "/css/index.css")) {
style(nc, hm); serve_style_css(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/css/chunk-vendors.css")) { } else if (mg_http_match_uri(hm, "/css/chunk-vendors.css")) {
style_vendor(nc, hm); serve_chunk_vendors_css(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/js/index.js")) { } else if (mg_http_match_uri(hm, "/js/index.js")) {
javascript(nc, hm); serve_index_js(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/js/chunk-vendors.js")) { } else if (mg_http_match_uri(hm, "/js/chunk-vendors.js")) {
javascript_vendor(nc, hm); serve_chunk_vendors_js(nc, hm);
return; return;
} else if (mg_http_match_uri(hm, "/i")) { } else if (mg_http_match_uri(hm, "/i")) {
index_info(nc); index_info(nc);
@ -683,6 +640,8 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
status(nc); status(nc);
} else if (mg_http_match_uri(hm, "/f/*")) { } else if (mg_http_match_uri(hm, "/f/*")) {
file(nc, hm); file(nc, hm);
} else if (mg_http_match_uri(hm, "/t/*/*/*")) {
thumbnail_with_num(nc, hm);
} else if (mg_http_match_uri(hm, "/t/*/*")) { } else if (mg_http_match_uri(hm, "/t/*/*")) {
thumbnail(nc, hm); thumbnail(nc, hm);
} else if (mg_http_match_uri(hm, "/s/*/*")) { } else if (mg_http_match_uri(hm, "/s/*/*")) {
@ -706,7 +665,7 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
response_t *r = ctx->response; response_t *r = ctx->response;
if (r->status_code == 200) { if (r->status_code == 200) {
send_response_line(nc, 200, r->size, "Content-Type: application/json"); web_send_headers(nc, 200, r->size, "Content-Type: application/json");
mg_send(nc, r->body, r->size); mg_send(nc, r->body, r->size);
} else if (r->status_code == 0) { } else if (r->status_code == 0) {
sist_log("serve.c", LOG_SIST_ERROR, "Could not connect to elasticsearch!"); sist_log("serve.c", LOG_SIST_ERROR, "Could not connect to elasticsearch!");
@ -738,7 +697,7 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
void serve(const char *listen_address) { void serve(const char *listen_address) {
LOG_INFOF("serve.c", "Starting web server @ http://%s", listen_address) LOG_INFOF("serve.c", "Starting web server @ http://%s", listen_address);
struct mg_mgr mgr; struct mg_mgr mgr;
mg_mgr_init(&mgr); mg_mgr_init(&mgr);
@ -747,12 +706,12 @@ void serve(const char *listen_address) {
struct mg_connection *nc = mg_http_listen(&mgr, listen_address, ev_router, NULL); struct mg_connection *nc = mg_http_listen(&mgr, listen_address, ev_router, NULL);
if (nc == NULL) { if (nc == NULL) {
LOG_FATALF("serve.c", "Couldn't bind web server on address %s", listen_address) LOG_FATALF("serve.c", "Couldn't bind web server on address %s", listen_address);
} }
while (ok) { while (ok) {
mg_mgr_poll(&mgr, 10); mg_mgr_poll(&mgr, 10);
} }
mg_mgr_free(&mgr); mg_mgr_free(&mgr);
LOG_INFO("serve.c", "Finished web event loop") LOG_INFO("serve.c", "Finished web event loop");
} }

63
src/web/web_util.c Normal file
View File

@ -0,0 +1,63 @@
#include "web_util.h"
#include "static_generated.c"
void web_serve_asset_index_html(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(index_html), "Content-Type: text/html");
mg_send(nc, index_html, sizeof(index_html));
}
void web_serve_asset_index_js(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(index_js), "Content-Type: application/javascript");
mg_send(nc, index_js, sizeof(index_js));
}
void web_serve_asset_chunk_vendors_js(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(chunk_vendors_js), "Content-Type: application/javascript");
mg_send(nc, chunk_vendors_js, sizeof(chunk_vendors_js));
}
void web_serve_asset_favicon_ico(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(favicon_ico), "Content-Type: image/x-icon");
mg_send(nc, favicon_ico, sizeof(favicon_ico));
}
void web_serve_asset_style_css(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(index_css), "Content-Type: text/css");
mg_send(nc, index_css, sizeof(index_css));
}
void web_serve_asset_chunk_vendors_css(struct mg_connection *nc) {
web_send_headers(nc, 200, sizeof(chunk_vendors_css), "Content-Type: text/css");
mg_send(nc, chunk_vendors_css, sizeof(chunk_vendors_css));
}
index_t *web_get_index_by_id(const char *index_id) {
for (int i = WebCtx.index_count - 1; i >= 0; i--) {
if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
return &WebCtx.indices[i];
}
}
return NULL;
}
database_t *web_get_database(const char *index_id) {
index_t *idx = web_get_index_by_id(index_id);
if (idx != NULL) {
return idx->db;
}
return NULL;
}
void web_send_headers(struct mg_connection *nc, int status_code, size_t length, char *extra_headers) {
mg_printf(
nc,
"HTTP/1.1 %d %s\r\n"
HTTP_SERVER_HEADER
"Content-Length: %d\r\n"
"%s\r\n\r\n",
status_code, "OK",
(int) length,
extra_headers
);
}
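/*
* Illustrative usage (mirrors the call sites in serve.c): send the response
* line and headers, then the body separately.
*
* web_send_headers(nc, 200, strlen(json_str), "Content-Type: application/json");
* mg_send(nc, json_str, strlen(json_str));
*/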

32
src/web/web_util.h Normal file
View File

@ -0,0 +1,32 @@
#ifndef SIST2_WEB_UTIL_H
#define SIST2_WEB_UTIL_H
#include "src/sist.h"
#include "src/index/elastic.h"
#include "src/ctx.h"
#include <mongoose.h>
#define HTTP_SERVER_HEADER "Server: sist2/" VERSION "\r\n"
index_t *web_get_index_by_id(const char *index_id);
database_t *web_get_database(const char *index_id);
__always_inline
static char *web_address_to_string(struct mg_addr *addr) {
return "TODO";
// static char address_to_string_buf[INET6_ADDRSTRLEN];
//
// return mg_ntoa(addr, address_to_string_buf, sizeof(address_to_string_buf));
}
void web_send_headers(struct mg_connection *nc, int status_code, size_t length, char *extra_headers);
void web_serve_asset_index_html(struct mg_connection *nc);
void web_serve_asset_index_js(struct mg_connection *nc);
void web_serve_asset_chunk_vendors_js(struct mg_connection *nc);
void web_serve_asset_favicon_ico(struct mg_connection *nc);
void web_serve_asset_style_css(struct mg_connection *nc);
void web_serve_asset_chunk_vendors_css(struct mg_connection *nc);
#endif //SIST2_WEB_UTIL_H

View File

@ -97,7 +97,6 @@ find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED) find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED) find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec) find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd) find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped) find_library(FREETYPE_LIB NAMES freetype freetyped)
@ -110,6 +109,7 @@ find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper) find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo) find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/) find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
find_package(Leptonica CONFIG REQUIRED)
target_compile_options( target_compile_options(
@ -231,6 +231,7 @@ target_link_libraries(
antiword antiword
mobi mobi
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
leptonica
) )
target_include_directories( target_include_directories(

View File

@ -9,27 +9,13 @@
#define MAX_DECOMPRESSED_SIZE_RATIO 40.0 #define MAX_DECOMPRESSED_SIZE_RATIO 40.0
int should_parse_filtered_file(const char *filepath, int ext) { int should_parse_filtered_file(const char *filepath) {
char tmp[PATH_MAX * 2];
if (ext == 0) { if (strstr(filepath, ".tgz")) {
return FALSE;
}
if (strncmp(filepath + ext, "tgz", 3) == 0) {
return TRUE; return TRUE;
} }
memcpy(tmp, filepath, ext - 1); if (strstr(filepath, ".tar.")) {
*(tmp + ext - 1) = '\0';
char *idx = strrchr(tmp, '.');
if (idx == NULL) {
return FALSE;
}
if (strcmp(idx, ".tar") == 0) {
return TRUE; return TRUE;
} }
@ -206,18 +192,10 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat entry_stat = *archive_entry_stat(entry); struct stat entry_stat = *archive_entry_stat(entry);
sub_job->vfile.st_mode = entry_stat.st_mode;
sub_job->vfile.st_size = entry_stat.st_size; sub_job->vfile.st_size = entry_stat.st_size;
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec; sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size; if (S_ISREG(entry_stat.st_mode)) {
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
decompressed_size_ratio)
continue;
}
if (S_ISREG(sub_job->vfile.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry); const char *utf8_name = archive_entry_pathname_utf8(entry);
@@ -231,6 +209,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
             }
             sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
+            double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
+            if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
+                CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
+                               decompressed_size_ratio)
+                break;
+            }
             // Handle excludes
             if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
                 CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
@@ -67,7 +67,7 @@ static int vfile_close_callback(struct archive *a, void *user_data) {
 int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
-int should_parse_filtered_file(const char *filepath, int ext);
+int should_parse_filtered_file(const char *filepath);
 scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
@@ -162,7 +162,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
     avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
     APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
+    ctx->store(doc->doc_id, 0, (char *) jpeg_packet.data, jpeg_packet.size);
     free(samples);
     av_packet_unref(&jpeg_packet);
@@ -232,7 +232,7 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
     bmp_format(&bmp_data, dimensions, bitmap);
     APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) bmp_data.buf, bmp_data.cur);
+    ctx->store(doc->doc_id, 0, bmp_data.buf, bmp_data.cur);
     dyn_buffer_destroy(&bmp_data);
     free(bitmap);
@@ -468,8 +468,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
     if (scaled_frame == STORE_AS_IS) {
         return_value = SAVE_THUMBNAIL_OK;
-        ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
-                   frame_and_packet->packet->size);
+        ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
     } else {
         // Encode frame to jpeg
         AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
@@ -482,19 +481,17 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
         // Save thumbnail
         if (thumbnail_index == 0) {
-            ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
+            ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
             return_value = SAVE_THUMBNAIL_OK;
         } else if (thumbnail_index > 1) {
-            return_value = SAVE_THUMBNAIL_OK;
             // TO FIX: the 2nd rendered frame is always broken, just skip it until
             // I figure out a better fix.
             thumbnail_index -= 1;
-            char tn_key[sizeof(doc->doc_id) + sizeof(char) * 4];
-            snprintf(tn_key, sizeof(tn_key), "%s%04d", doc->doc_id, thumbnail_index);
-            ctx->store((char *) tn_key, sizeof(tn_key), (char *) jpeg_packet.data, jpeg_packet.size);
+            ctx->store(doc->doc_id, thumbnail_index, jpeg_packet.data, jpeg_packet.size);
+            return_value = SAVE_THUMBNAIL_OK;
         } else {
             return_value = SAVE_THUMBNAIL_SKIPPED;
         }
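Extra video thumbnails are no longer stored under derived string keys ("<doc_id>NNNN" via snprintf); the integer index travels straight to the store callback. A minimal sketch contrasting the two schemes (the 37-byte doc_id buffer and the callback shape are assumptions for illustration):

#include <stdio.h>

// Assumed callback shape, matching the new typedef in scan.h
static void store_sketch(char *doc_id, int num, void *buf, size_t buf_len) {
    (void) buf;
    printf("store doc=%s num=%d (%zu bytes)\n", doc_id, num, buf_len);
}

int main(void) {
    char doc_id[37] = "0123456789abcdef0123456789abcdef0123"; // hypothetical ID
    int thumbnail_index = 1;
    char jpeg[16] = {0};

    // Old scheme: derive a string key such as "<doc_id>0001"
    char tn_key[sizeof(doc_id) + 4];
    snprintf(tn_key, sizeof(tn_key), "%s%04d", doc_id, thumbnail_index);
    printf("old key: %s\n", tn_key);

    // New scheme: the index is its own argument
    store_sketch(doc_id, thumbnail_index, jpeg, sizeof(jpeg));
    return 0;
}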
@@ -854,8 +851,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
     if (scaled_frame == STORE_AS_IS) {
         APPEND_LONG_META(doc, MetaThumbnail, 1)
-        ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
-                   frame_and_packet->packet->size);
+        ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
     } else {
         // Encode frame to jpeg
         AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
@@ -868,7 +864,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
     // Save thumbnail
     APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
+    ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
     av_packet_unref(&jpeg_packet);
     avcodec_free_context(&jpeg_encoder);
@@ -191,7 +191,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
     archive_read_data(a, buf, entry_size);
     APPEND_LONG_META(doc, MetaThumbnail, 1)
-    ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), buf, entry_size);
+    ctx->store(doc->doc_id, 1, buf, entry_size);
     free(buf);
 }
@@ -6,6 +6,7 @@
 #endif
 #include <stdio.h>
+#include <string.h>
 #include <sys/stat.h>
 #include <openssl/md5.h>
 #include <openssl/sha.h>
@@ -16,7 +17,7 @@
 #define UNUSED(x) __attribute__((__unused__)) x
-typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
+typedef void (*store_callback_t)(char *key, int num, void *buf, size_t buf_len);
 typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
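The store callback now takes an integer index instead of a key length, so blobs are addressed by (doc_id, num) pairs. A hypothetical implementation conforming to the new typedef (the sqlite-backed version implied by this commit is not shown; this sketch just writes numbered files):

#include <stdio.h>
#include <stddef.h>

typedef void (*store_callback_t)(char *key, int num, void *buf, size_t buf_len);

// Hypothetical implementation: one file per (doc_id, num) pair.
static void file_store(char *key, int num, void *buf, size_t buf_len) {
    char path[1024];
    snprintf(path, sizeof(path), "%s_%d.bin", key, num);

    FILE *f = fopen(path, "wb");
    if (f != NULL) {
        fwrite(buf, 1, buf_len, f);
        fclose(f);
    }
}

int main(void) {
    store_callback_t store = file_store;
    char data[] = "thumbnail bytes";
    store("doc123", 0, data, sizeof(data) - 1);
    return 0;
}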
@@ -111,8 +112,8 @@ typedef struct document {
     unsigned long size;
     unsigned int mime;
     int mtime;
-    short base;
-    short ext;
+    int base;
+    int ext;
     meta_line_t *meta_head;
     meta_line_t *meta_tail;
     char filepath[PATH_MAX * 2 + 1];
@@ -144,7 +145,6 @@ typedef struct vfile {
     int mtime;
     size_t st_size;
-    unsigned int st_mode;
     SHA_CTX sha1_ctx;
     unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
@@ -161,7 +161,7 @@ typedef struct vfile {
     logf_callback_t logf;
 } vfile_t;
-typedef struct parse_job_t {
+typedef struct {
     int base;
     int ext;
     struct vfile vfile;
@@ -358,4 +358,37 @@ static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
     }
 }
+static parse_job_t *create_parse_job(const char *filepath, int mtime, size_t st_size) {
+    parse_job_t *job = (parse_job_t *) malloc(sizeof(parse_job_t));
+    job->parent[0] = '\0';
+    strcpy(job->filepath, filepath);
+    strcpy(job->vfile.filepath, filepath);
+    job->vfile.st_size = st_size;
+    job->vfile.mtime = mtime;
+    const char *slash = strrchr(filepath, '/');
+    if (slash == NULL) {
+        job->base = 0;
+    } else {
+        job->base = (int) (slash - filepath + 1);
+    }
+    const char *dot = strrchr(filepath + job->base, '.');
+    if (dot == NULL) {
+        job->ext = (int) strlen(filepath);
+    } else {
+        job->ext = (int) (dot - filepath + 1);
+    }
+    job->vfile.fd = -1;
+    job->vfile.is_fs_file = TRUE;
+    job->vfile.has_checksum = FALSE;
+    job->vfile.rewind_buffer_size = 0;
+    job->vfile.rewind_buffer = NULL;
+    return job;
+}
 #endif
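For reference, a small standalone program reproducing the base/ext arithmetic from create_parse_job() on a hypothetical path; base points just past the last '/', and ext just past the last '.' of the file name (or at the terminating NUL when there is no extension):

#include <stdio.h>
#include <string.h>

int main(void) {
    const char *filepath = "/data/docs/report.pdf"; // hypothetical path

    const char *slash = strrchr(filepath, '/');
    int base = slash == NULL ? 0 : (int) (slash - filepath + 1);

    const char *dot = strrchr(filepath + base, '.');
    int ext = dot == NULL ? (int) strlen(filepath) : (int) (dot - filepath + 1);

    printf("base=%d -> \"%s\"\n", base, filepath + base); // base=11 -> "report.pdf"
    printf("ext=%d -> \"%s\"\n", ext, filepath + ext);    // ext=18 -> "pdf"
    return 0;
}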
@@ -55,7 +55,6 @@ void load_file(const char *filepath, vfile_t *f) {
     f->mtime = (int) info.st_mtim.tv_sec;
     f->st_size = info.st_size;
-    f->st_mode = info.st_mode;
     f->fd = open(filepath, O_RDONLY);
@@ -21,7 +21,7 @@ static void noop_log(const char *filepath, int level, char *str) {
 static size_t store_size = 0;
-static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
+static void counter_store(char* key, int num, void *value, size_t value_len) {
     store_size += value_len;
     // char id[37];
     // char tmp[PATH_MAX];
@@ -1 +1 @@
-Subproject commit ddb042143e72a8b789e06f09dbc897dfa9f15b82
+Subproject commit badfdac84586511d4f2b626516162d62a3625349