mirror of
https://github.com/simon987/sist2.git
synced 2025-04-04 07:52:59 +00:00
use sqlite to save index, major thread pool refactor
This commit is contained in:
parent
ca973d63a4
commit
fc36f33d52
2
.gitignore
vendored
2
.gitignore
vendored
@ -41,3 +41,5 @@ build.ninja
|
||||
src/web/static_generated.c
|
||||
src/magic_generated.c
|
||||
src/index/static_generated.c
|
||||
*.sist2
|
||||
*-shm
|
@ -22,30 +22,33 @@ set(ARGPARSE_SHARED off)
|
||||
add_subdirectory(third-party/argparse)
|
||||
|
||||
add_executable(sist2
|
||||
# argparse
|
||||
third-party/argparse/argparse.h third-party/argparse/argparse.c
|
||||
|
||||
src/main.c
|
||||
src/sist.h
|
||||
src/io/walk.h src/io/walk.c
|
||||
src/io/store.h src/io/store.c
|
||||
src/tpool.h src/tpool.c
|
||||
src/parsing/parse.h src/parsing/parse.c
|
||||
src/parsing/magic_util.c src/parsing/magic_util.h
|
||||
src/io/serialize.h src/io/serialize.c
|
||||
src/parsing/mime.h src/parsing/mime.c src/parsing/mime_generated.c
|
||||
src/index/web.c src/index/web.h
|
||||
src/web/serve.c src/web/serve.h
|
||||
src/web/web_util.c src/web/web_util.h
|
||||
src/index/elastic.c src/index/elastic.h
|
||||
src/util.c src/util.h
|
||||
src/ctx.h src/types.h
|
||||
src/ctx.c src/ctx.h
|
||||
src/types.h
|
||||
src/log.c src/log.h
|
||||
src/cli.c src/cli.h
|
||||
src/stats.c src/stats.h src/ctx.c
|
||||
src/parsing/sidecar.c src/parsing/sidecar.h
|
||||
src/mempool/mempool.c src/mempool/mempool.h
|
||||
src/database/database.c src/database/database.h
|
||||
src/parsing/fs_util.h
|
||||
|
||||
src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp
|
||||
|
||||
# argparse
|
||||
third-party/argparse/argparse.h third-party/argparse/argparse.c
|
||||
)
|
||||
src/database/database_stats.c src/database/database_stats.h src/database/database_schema.c)
|
||||
set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)
|
||||
|
||||
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
|
||||
@ -53,8 +56,6 @@ set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
|
||||
|
||||
find_package(PkgConfig REQUIRED)
|
||||
|
||||
pkg_search_module(GLIB REQUIRED glib-2.0)
|
||||
|
||||
find_package(lmdb CONFIG REQUIRED)
|
||||
find_package(cJSON CONFIG REQUIRED)
|
||||
find_package(unofficial-mongoose CONFIG REQUIRED)
|
||||
@ -63,6 +64,7 @@ find_library(MAGIC_LIB
|
||||
NAMES libmagic.so.1 magic
|
||||
PATHS /usr/lib/x86_64-linux-gnu/ /usr/lib/aarch64-linux-gnu/
|
||||
)
|
||||
find_package(unofficial-sqlite3 CONFIG REQUIRED)
|
||||
|
||||
|
||||
target_include_directories(
|
||||
@ -71,7 +73,6 @@ target_include_directories(
|
||||
${CMAKE_SOURCE_DIR}/third-party/utf8.h/
|
||||
${CMAKE_SOURCE_DIR}/third-party/libscan/
|
||||
${CMAKE_SOURCE_DIR}/
|
||||
${GLIB_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
target_compile_options(
|
||||
@ -90,6 +91,7 @@ if (SIST_DEBUG)
|
||||
-fsanitize=address
|
||||
-fno-inline
|
||||
# -O2
|
||||
-w
|
||||
)
|
||||
target_link_options(
|
||||
sist2
|
||||
@ -121,6 +123,7 @@ else ()
|
||||
-Ofast
|
||||
-fno-stack-protector
|
||||
-fomit-frame-pointer
|
||||
-w
|
||||
)
|
||||
endif ()
|
||||
|
||||
@ -137,17 +140,15 @@ target_link_libraries(
|
||||
lmdb
|
||||
cjson
|
||||
argparse
|
||||
${GLIB_LDFLAGS}
|
||||
unofficial::mongoose::mongoose
|
||||
CURL::libcurl
|
||||
|
||||
pthread
|
||||
|
||||
c
|
||||
|
||||
scan
|
||||
|
||||
${MAGIC_LIB}
|
||||
unofficial::sqlite3::sqlite3
|
||||
)
|
||||
|
||||
add_custom_target(
|
||||
|
@ -150,7 +150,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
|
||||
|
||||
```bash
|
||||
vcpkg install curl[core,openssl]
|
||||
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
|
||||
vcpkg install lmdb sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
|
||||
```
|
||||
|
||||
1. Build
|
||||
|
@ -1,10 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
rm -rf index.sist2/
|
||||
(
|
||||
cd ..
|
||||
rm -rf index.sist2
|
||||
|
||||
python3 scripts/mime.py > src/parsing/mime_generated.c
|
||||
python3 scripts/serve_static.py > src/web/static_generated.c
|
||||
python3 scripts/index_static.py > src/index/static_generated.c
|
||||
python3 scripts/magic_static.py > src/magic_generated.c
|
||||
python3 scripts/mime.py > src/parsing/mime_generated.c
|
||||
python3 scripts/serve_static.py > src/web/static_generated.c
|
||||
python3 scripts/index_static.py > src/index/static_generated.c
|
||||
python3 scripts/magic_static.py > src/magic_generated.c
|
||||
|
||||
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
|
||||
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
|
||||
)
|
@ -29,7 +29,7 @@ application/mime, aps
|
||||
application/mspowerpoint, ppz
|
||||
application/msword, doc|dot|w6w|wiz|word
|
||||
application/netmc, mcp
|
||||
application/octet-stream, bin|dump|gpg
|
||||
application/octet-stream, bin|dump|gpg|pack|idx
|
||||
application/oda, oda
|
||||
application/ogg, ogv
|
||||
application/pdf, pdf
|
||||
@ -243,7 +243,7 @@ audio/make, funk|my|pfunk
|
||||
audio/midi, kar
|
||||
audio/mid, rmi
|
||||
audio/mp4, m4b
|
||||
audio/mpeg, m2a|mpa
|
||||
audio/mpeg, m2a|mpa|mpga
|
||||
audio/ogg, ogg
|
||||
audio/s3m, s3m
|
||||
audio/tsp-audio, tsi
|
||||
@ -382,7 +382,7 @@ text/x-pascal, p
|
||||
text/x-perl, pl
|
||||
text/x-php, php
|
||||
text/x-po, po
|
||||
text/x-python, py
|
||||
text/x-python, py|pyi
|
||||
text/x-ruby, rb
|
||||
text/x-sass, sass
|
||||
text/x-scss, scss
|
||||
|
|
@ -1,3 +1,5 @@
|
||||
import zlib
|
||||
|
||||
mimes = {}
|
||||
noparse = set()
|
||||
ext_in_hash = set()
|
||||
@ -135,24 +137,40 @@ def clean(t):
|
||||
return t.replace("/", "_").replace(".", "_").replace("+", "_").replace("-", "_")
|
||||
|
||||
|
||||
def crc(s):
|
||||
return zlib.crc32(s.encode()) & 0xffffffff
|
||||
|
||||
|
||||
with open("scripts/mime.csv") as f:
|
||||
for l in f:
|
||||
mime, ext_list = l.split(",")
|
||||
if l.startswith("!"):
|
||||
mime = mime[1:]
|
||||
noparse.add(mime)
|
||||
ext = [x.strip() for x in ext_list.split("|")]
|
||||
ext = [x.strip() for x in ext_list.split("|") if x.strip() != ""]
|
||||
mimes[mime] = ext
|
||||
|
||||
seen_crc = set()
|
||||
for ext in mimes.values():
|
||||
for e in ext:
|
||||
if crc(e) in seen_crc:
|
||||
raise Exception("CRC32 collision")
|
||||
seen_crc.add(crc(e))
|
||||
|
||||
seen_crc = set()
|
||||
for mime in mimes.keys():
|
||||
if crc(mime) in seen_crc:
|
||||
raise Exception("CRC32 collision")
|
||||
seen_crc.add(crc(mime))
|
||||
|
||||
print("// **Generated by mime.py**")
|
||||
print("#ifndef MIME_GENERATED_C")
|
||||
print("#define MIME_GENERATED_C")
|
||||
print("#include <glib.h>\n")
|
||||
print("#include <stdlib.h>\n")
|
||||
# Enum
|
||||
print("enum mime {")
|
||||
for mime, ext in sorted(mimes.items()):
|
||||
print(" " + clean(mime) + "=" + mime_id(mime) + ",")
|
||||
print(f"{clean(mime)}={mime_id(mime)},")
|
||||
print("};")
|
||||
|
||||
# Enum -> string
|
||||
@ -163,20 +181,20 @@ with open("scripts/mime.csv") as f:
|
||||
print("default: return NULL;}}")
|
||||
|
||||
# Ext -> Enum
|
||||
print("GHashTable *mime_get_ext_table() {"
|
||||
"GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);")
|
||||
print("unsigned int mime_extension_lookup(unsigned long extension_crc32) {"
|
||||
"switch (extension_crc32) {")
|
||||
for mime, ext in mimes.items():
|
||||
for e in [e for e in ext if e]:
|
||||
print("g_hash_table_insert(ext_table, \"" + e + "\", (gpointer)" + clean(mime) + ");")
|
||||
if e in ext_in_hash:
|
||||
raise Exception("extension already in hash: " + e)
|
||||
ext_in_hash.add(e)
|
||||
print("return ext_table;}")
|
||||
if len(ext) > 0:
|
||||
for e in ext:
|
||||
print(f"case {crc(e)}:", end="")
|
||||
print(f"return {clean(mime)};")
|
||||
print("default: return 0;}}")
|
||||
|
||||
# string -> Enum
|
||||
print("GHashTable *mime_get_mime_table() {"
|
||||
"GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);")
|
||||
for mime, ext in mimes.items():
|
||||
print("g_hash_table_insert(mime_table, \"" + mime + "\", (gpointer)" + clean(mime) + ");")
|
||||
print("return mime_table;}")
|
||||
print("unsigned int mime_name_lookup(unsigned long mime_crc32) {"
|
||||
"switch (mime_crc32) {")
|
||||
for mime in mimes.keys():
|
||||
print(f"case {crc(mime)}: return {clean(mime)};")
|
||||
|
||||
print("default: return 0;}}")
|
||||
print("#endif")
|
||||
|
@ -4,7 +4,7 @@
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1.0">
|
||||
<link rel="icon" href="<%= BASE_URL %>favicon.ico">
|
||||
<link rel="icon" href="<%= BASE_URL %>serve_favicon_ico.ico">
|
||||
<title>sist2-admin</title>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -1,12 +1,13 @@
|
||||
#ifndef SIST2_AUTH0_C_API_H
|
||||
#define SIST2_AUTH0_C_API_H
|
||||
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define EXTERNC extern "C"
|
||||
#include "cstdlib"
|
||||
#else
|
||||
#define EXTERNC
|
||||
#include "stdlib.h"
|
||||
#endif
|
||||
|
||||
#define AUTH0_OK (0)
|
||||
|
163
src/cli.c
163
src/cli.c
@ -2,16 +2,17 @@
|
||||
#include "ctx.h"
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#define DEFAULT_OUTPUT "index.sist2/"
|
||||
#define DEFAULT_OUTPUT "index.sist2"
|
||||
#define DEFAULT_NAME "index"
|
||||
#define DEFAULT_CONTENT_SIZE 32768
|
||||
#define DEFAULT_QUALITY 2
|
||||
#define DEFAULT_THUMBNAIL_SIZE 500
|
||||
#define DEFAULT_THUMBNAIL_SIZE 552
|
||||
#define DEFAULT_THUMBNAIL_COUNT 1
|
||||
#define DEFAULT_REWRITE_URL ""
|
||||
|
||||
#define DEFAULT_ES_URL "http://localhost:9200"
|
||||
#define DEFAULT_ES_INDEX "sist2"
|
||||
#define DEFAULT_BATCH_SIZE 100
|
||||
#define DEFAULT_BATCH_SIZE 70
|
||||
#define DEFAULT_TAGLINE "Lightning-fast file system indexer and search tool"
|
||||
#define DEFAULT_LANG "en"
|
||||
|
||||
@ -20,8 +21,6 @@
|
||||
|
||||
#define DEFAULT_MAX_MEM_BUFFER 2000
|
||||
|
||||
#define DEFAULT_THROTTLE_MEMORY_THRESHOLD 0
|
||||
|
||||
const char *TESS_DATAPATHS[] = {
|
||||
"/usr/share/tessdata/",
|
||||
"/usr/share/tesseract-ocr/tessdata/",
|
||||
@ -48,9 +47,6 @@ void scan_args_destroy(scan_args_t *args) {
|
||||
if (args->name != NULL) {
|
||||
free(args->name);
|
||||
}
|
||||
if (args->incremental != NULL) {
|
||||
free(args->incremental);
|
||||
}
|
||||
if (args->path != NULL) {
|
||||
free(args->path);
|
||||
}
|
||||
@ -61,7 +57,6 @@ void scan_args_destroy(scan_args_t *args) {
|
||||
}
|
||||
|
||||
void index_args_destroy(index_args_t *args) {
|
||||
//todo
|
||||
if (args->es_mappings_path) {
|
||||
free(args->es_mappings);
|
||||
}
|
||||
@ -76,7 +71,6 @@ void index_args_destroy(index_args_t *args) {
|
||||
}
|
||||
|
||||
void web_args_destroy(web_args_t *args) {
|
||||
//todo
|
||||
free(args);
|
||||
}
|
||||
|
||||
@ -97,19 +91,13 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
|
||||
char *abs_path = abspath(argv[1]);
|
||||
if (abs_path == NULL) {
|
||||
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
|
||||
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
|
||||
} else {
|
||||
abs_path = realloc(abs_path, strlen(abs_path) + 2);
|
||||
strcat(abs_path, "/");
|
||||
args->path = abs_path;
|
||||
}
|
||||
|
||||
if (args->incremental != OPTION_VALUE_UNSPECIFIED) {
|
||||
args->incremental = abspath(args->incremental);
|
||||
if (abs_path == NULL) {
|
||||
sist_log("main.c", LOG_SIST_WARNING, "Could not open original index! Disabled incremental scan feature.");
|
||||
args->incremental = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) {
|
||||
args->tn_quality = DEFAULT_QUALITY;
|
||||
} else if (args->tn_quality < 2 || args->tn_quality > 31) {
|
||||
@ -152,20 +140,24 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
args->output = expandpath(args->output);
|
||||
}
|
||||
|
||||
int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR);
|
||||
if (ret != 0) {
|
||||
fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno));
|
||||
return 1;
|
||||
char *abs_output = abspath(args->output);
|
||||
if (args->incremental && abs_output == NULL) {
|
||||
LOG_WARNINGF("main.c", "Could not open original index for incremental scan: %s. Will not perform incremental scan.", abs_output);
|
||||
args->incremental = FALSE;
|
||||
} else if (!args->incremental && abs_output != NULL) {
|
||||
LOG_FATALF("main.c", "Index already exists: %s. If you wish to perform incremental scan, you must specify --incremental", abs_output);
|
||||
}
|
||||
free(abs_output);
|
||||
|
||||
if (args->depth <= 0) {
|
||||
args->depth = G_MAXINT32;
|
||||
args->depth = 2147483647;
|
||||
} else {
|
||||
args->depth += 1;
|
||||
}
|
||||
|
||||
if (args->name == OPTION_VALUE_UNSPECIFIED) {
|
||||
args->name = g_path_get_basename(args->output);
|
||||
args->name = malloc(strlen(DEFAULT_NAME) + 1);
|
||||
strcpy(args->name, DEFAULT_NAME);
|
||||
} else {
|
||||
char *tmp = malloc(strlen(args->name) + 1);
|
||||
strcpy(tmp, args->name);
|
||||
@ -224,7 +216,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
}
|
||||
if (trained_data_path != NULL && path != trained_data_path) {
|
||||
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
|
||||
"files must be in the same folder")
|
||||
"files must be in the same folder");
|
||||
}
|
||||
trained_data_path = path;
|
||||
|
||||
@ -232,7 +224,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
}
|
||||
free(lang);
|
||||
|
||||
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
|
||||
int ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
|
||||
if (ret != 0) {
|
||||
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
|
||||
return 1;
|
||||
@ -249,12 +241,12 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
|
||||
pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
|
||||
if (error != NULL) {
|
||||
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
|
||||
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset);
|
||||
}
|
||||
|
||||
pcre_extra *re_extra = pcre_study(re, 0, &error);
|
||||
if (error != NULL) {
|
||||
LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
|
||||
LOG_FATALF("cli.c", "pcre_study returned error: %s", error);
|
||||
}
|
||||
|
||||
ScanCtx.exclude = re;
|
||||
@ -276,7 +268,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
if (args->list_path != OPTION_VALUE_UNSPECIFIED) {
|
||||
if (strcmp(args->list_path, "-") == 0) {
|
||||
args->list_file = stdin;
|
||||
LOG_DEBUG("cli.c", "Using stdin as list file")
|
||||
LOG_DEBUG("cli.c", "Using stdin as list file");
|
||||
} else {
|
||||
args->list_file = fopen(args->list_path, "r");
|
||||
|
||||
@ -286,27 +278,27 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality)
|
||||
LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size)
|
||||
LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count)
|
||||
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
|
||||
LOG_DEBUGF("cli.c", "arg threads=%d", args->threads)
|
||||
LOG_DEBUGF("cli.c", "arg incremental=%s", args->incremental)
|
||||
LOG_DEBUGF("cli.c", "arg output=%s", args->output)
|
||||
LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url)
|
||||
LOG_DEBUGF("cli.c", "arg name=%s", args->name)
|
||||
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
|
||||
LOG_DEBUGF("cli.c", "arg path=%s", args->path)
|
||||
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
|
||||
LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase)
|
||||
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
|
||||
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
|
||||
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
|
||||
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
|
||||
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
|
||||
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
|
||||
LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib)
|
||||
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
|
||||
LOG_DEBUGF("cli.c", "arg tn_quality=%f", args->tn_quality);
|
||||
LOG_DEBUGF("cli.c", "arg tn_size=%d", args->tn_size);
|
||||
LOG_DEBUGF("cli.c", "arg tn_count=%d", args->tn_count);
|
||||
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size);
|
||||
LOG_DEBUGF("cli.c", "arg threads=%d", args->threads);
|
||||
LOG_DEBUGF("cli.c", "arg incremental=%d", args->incremental);
|
||||
LOG_DEBUGF("cli.c", "arg output=%s", args->output);
|
||||
LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url);
|
||||
LOG_DEBUGF("cli.c", "arg name=%s", args->name);
|
||||
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth);
|
||||
LOG_DEBUGF("cli.c", "arg path=%s", args->path);
|
||||
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive);
|
||||
LOG_DEBUGF("cli.c", "arg archive_passphrase=%s", args->archive_passphrase);
|
||||
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang);
|
||||
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path);
|
||||
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex);
|
||||
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast);
|
||||
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub);
|
||||
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold);
|
||||
LOG_DEBUGF("cli.c", "arg max_memory_buffer_mib=%d", args->max_memory_buffer_mib);
|
||||
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -316,20 +308,20 @@ int load_external_file(const char *file_path, char **dst) {
|
||||
int res = stat(file_path, &info);
|
||||
|
||||
if (res == -1) {
|
||||
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
|
||||
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
int fd = open(file_path, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno))
|
||||
LOG_ERRORF("cli.c", "Error opening file '%s': %s\n", file_path, strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
*dst = malloc(info.st_size + 1);
|
||||
res = read(fd, *dst, info.st_size);
|
||||
if (res < 0) {
|
||||
LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno))
|
||||
LOG_ERRORF("cli.c", "Error reading file '%s': %s\n", file_path, strerror(errno));
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -357,7 +349,7 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
|
||||
|
||||
char *index_path = abspath(argv[1]);
|
||||
if (index_path == NULL) {
|
||||
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1])
|
||||
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
|
||||
} else {
|
||||
args->index_path = index_path;
|
||||
}
|
||||
@ -392,28 +384,28 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
|
||||
args->batch_size = DEFAULT_BATCH_SIZE;
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
|
||||
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
|
||||
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
|
||||
LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path)
|
||||
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
|
||||
LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script)
|
||||
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
|
||||
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
|
||||
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
|
||||
LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path);
|
||||
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);
|
||||
LOG_DEBUGF("cli.c", "arg async_script=%d", args->async_script);
|
||||
|
||||
if (args->script) {
|
||||
char log_buf[5000];
|
||||
|
||||
strncpy(log_buf, args->script, sizeof(log_buf));
|
||||
*(log_buf + sizeof(log_buf) - 1) = '\0';
|
||||
LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
|
||||
LOG_DEBUGF("cli.c", "arg script=%s", log_buf);
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg print=%d", args->print)
|
||||
LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path)
|
||||
LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings)
|
||||
LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path)
|
||||
LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings)
|
||||
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size)
|
||||
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset)
|
||||
LOG_DEBUGF("cli.c", "arg print=%d", args->print);
|
||||
LOG_DEBUGF("cli.c", "arg es_mappings_path=%s", args->es_mappings_path);
|
||||
LOG_DEBUGF("cli.c", "arg es_mappings=%s", args->es_mappings);
|
||||
LOG_DEBUGF("cli.c", "arg es_settings_path=%s", args->es_settings_path);
|
||||
LOG_DEBUGF("cli.c", "arg es_settings=%s", args->es_settings);
|
||||
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size);
|
||||
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -534,23 +526,24 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
|
||||
for (int i = 0; i < args->index_count; i++) {
|
||||
char *abs_path = abspath(args->indices[i]);
|
||||
if (abs_path == NULL) {
|
||||
LOG_FATALF("cli.c", "Index not found: %s", args->indices[i])
|
||||
LOG_FATALF("cli.c", "Index not found: %s", args->indices[i]);
|
||||
}
|
||||
free(abs_path);
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
|
||||
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index)
|
||||
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl)
|
||||
LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline)
|
||||
LOG_DEBUGF("cli.c", "arg dev=%d", args->dev)
|
||||
LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address)
|
||||
LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials)
|
||||
LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials)
|
||||
LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user)
|
||||
LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass)
|
||||
LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count)
|
||||
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url);
|
||||
LOG_DEBUGF("cli.c", "arg es_index=%s", args->es_index);
|
||||
LOG_DEBUGF("cli.c", "arg es_insecure_ssl=%d", args->es_insecure_ssl);
|
||||
LOG_DEBUGF("cli.c", "arg tagline=%s", args->tagline);
|
||||
LOG_DEBUGF("cli.c", "arg dev=%d", args->dev);
|
||||
LOG_DEBUGF("cli.c", "arg listen=%s", args->listen_address);
|
||||
LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials);
|
||||
LOG_DEBUGF("cli.c", "arg tag_credentials=%s", args->tag_credentials);
|
||||
LOG_DEBUGF("cli.c", "arg auth_user=%s", args->auth_user);
|
||||
LOG_DEBUGF("cli.c", "arg auth_pass=%s", args->auth_pass);
|
||||
LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count);
|
||||
for (int i = 0; i < args->index_count; i++) {
|
||||
LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i])
|
||||
LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -575,7 +568,7 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
|
||||
|
||||
char *index_path = abspath(argv[1]);
|
||||
if (index_path == NULL) {
|
||||
LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1])
|
||||
LOG_FATALF("cli.c", "Invalid index PATH argument. File not found: %s", argv[1]);
|
||||
} else {
|
||||
args->index_path = index_path;
|
||||
}
|
||||
@ -596,12 +589,12 @@ int exec_args_validate(exec_args_t *args, int argc, const char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
|
||||
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path);
|
||||
|
||||
char log_buf[5000];
|
||||
strncpy(log_buf, args->script, sizeof(log_buf));
|
||||
*(log_buf + sizeof(log_buf) - 1) = '\0';
|
||||
LOG_DEBUGF("cli.c", "arg script=%s", log_buf)
|
||||
LOG_DEBUGF("cli.c", "arg script=%s", log_buf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -13,7 +13,7 @@ typedef struct scan_args {
|
||||
int tn_size;
|
||||
int content_size;
|
||||
int threads;
|
||||
char *incremental;
|
||||
int incremental;
|
||||
char *output;
|
||||
char *rewrite_url;
|
||||
char *name;
|
||||
|
@ -3,9 +3,10 @@
|
||||
ScanCtx_t ScanCtx = {
|
||||
.stat_index_size = 0,
|
||||
.stat_tn_size = 0,
|
||||
.dbg_current_files = NULL,
|
||||
.pool = NULL
|
||||
.pool = NULL,
|
||||
.index.path = {0,},
|
||||
};
|
||||
WebCtx_t WebCtx;
|
||||
IndexCtx_t IndexCtx;
|
||||
LogCtx_t LogCtx;
|
||||
__thread ProcData_t ProcData;
|
||||
|
27
src/ctx.h
27
src/ctx.h
@ -16,22 +16,17 @@
|
||||
#include "libscan/msdoc/msdoc.h"
|
||||
#include "libscan/wpd/wpd.h"
|
||||
#include "libscan/json/json.h"
|
||||
#include "src/io/store.h"
|
||||
#include "src/database/database.h"
|
||||
#include "src/index/elastic.h"
|
||||
#include "sqlite3.h"
|
||||
|
||||
#include <glib.h>
|
||||
#include <pcre.h>
|
||||
|
||||
typedef struct {
|
||||
struct index_t index;
|
||||
|
||||
GHashTable *mime_table;
|
||||
GHashTable *ext_table;
|
||||
|
||||
tpool_t *pool;
|
||||
|
||||
tpool_t *writer_pool;
|
||||
|
||||
int threads;
|
||||
int depth;
|
||||
int calculate_checksums;
|
||||
@ -39,16 +34,10 @@ typedef struct {
|
||||
size_t stat_tn_size;
|
||||
size_t stat_index_size;
|
||||
|
||||
GHashTable *original_table;
|
||||
GHashTable *copy_table;
|
||||
GHashTable *new_table;
|
||||
pthread_mutex_t copy_table_mu;
|
||||
|
||||
pcre *exclude;
|
||||
pcre_extra *exclude_extra;
|
||||
int fast;
|
||||
|
||||
GHashTable *dbg_current_files;
|
||||
pthread_mutex_t dbg_current_files_mu;
|
||||
|
||||
int dbg_failed_files_count;
|
||||
@ -84,10 +73,6 @@ typedef struct {
|
||||
char *es_index;
|
||||
int batch_size;
|
||||
tpool_t *pool;
|
||||
store_t *tag_store;
|
||||
GHashTable *tags;
|
||||
store_t *meta_store;
|
||||
GHashTable *meta;
|
||||
/**
|
||||
* Set to false when using --print
|
||||
*/
|
||||
@ -117,10 +102,18 @@ typedef struct {
|
||||
int dev;
|
||||
} WebCtx_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
int thread_id;
|
||||
database_t *ipc_db;
|
||||
database_t *index_db;
|
||||
} ProcData_t;
|
||||
|
||||
extern ScanCtx_t ScanCtx;
|
||||
extern WebCtx_t WebCtx;
|
||||
extern IndexCtx_t IndexCtx;
|
||||
extern LogCtx_t LogCtx;
|
||||
extern __thread ProcData_t ProcData;
|
||||
|
||||
|
||||
#endif
|
||||
|
586
src/database/database.c
Normal file
586
src/database/database.c
Normal file
@ -0,0 +1,586 @@
|
||||
#include "database.h"
|
||||
#include "malloc.h"
|
||||
#include "src/ctx.h"
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
#include "src/util.h"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
|
||||
|
||||
database_t *database_create(const char *filename, database_type_t type) {
|
||||
database_t *db = malloc(sizeof(database_t));
|
||||
|
||||
strcpy(db->filename, filename);
|
||||
db->type = type;
|
||||
db->select_thumbnail_stmt = NULL;
|
||||
|
||||
db->ipc_ctx = NULL;
|
||||
|
||||
return db;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int sep_rfind(const char *str) {
|
||||
for (int i = (int) strlen(str); i >= 0; i--) {
|
||||
if (str[i] == '/') {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
void path_parent_func(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
|
||||
if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
|
||||
sqlite3_result_error(ctx, "Invalid parameters", -1);
|
||||
}
|
||||
|
||||
const char *value = (const char *) sqlite3_value_text(argv[0]);
|
||||
|
||||
int stop = sep_rfind(value);
|
||||
if (stop == -1) {
|
||||
sqlite3_result_null(ctx);
|
||||
return;
|
||||
}
|
||||
char parent[PATH_MAX * 3];
|
||||
strncpy(parent, value, stop);
|
||||
|
||||
sqlite3_result_text(ctx, parent, stop, SQLITE_TRANSIENT);
|
||||
}
|
||||
|
||||
|
||||
void save_current_job_info(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
|
||||
if (argc != 1 || sqlite3_value_type(argv[0]) != SQLITE_TEXT) {
|
||||
sqlite3_result_error(ctx, "Invalid parameters", -1);
|
||||
}
|
||||
|
||||
database_ipc_ctx_t *ipc_ctx = sqlite3_user_data(ctx);
|
||||
|
||||
const char *current_job = (const char *) sqlite3_value_text(argv[0]);
|
||||
|
||||
char buf[PATH_MAX];
|
||||
strcpy(buf, current_job);
|
||||
|
||||
strcpy(ipc_ctx->current_job[ProcData.thread_id], current_job);
|
||||
|
||||
sqlite3_result_text(ctx, "ok", -1, SQLITE_STATIC);
|
||||
}
|
||||
|
||||
void database_initialize(database_t *db) {
|
||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
|
||||
|
||||
LOG_DEBUGF("database.c", "Initializing database %s", db->filename);
|
||||
if (db->type == INDEX_DATABASE) {
|
||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IndexDatabaseSchema, NULL, NULL, NULL));
|
||||
} else if (db->type == IPC_CONSUMER_DATABASE || db->type == IPC_PRODUCER_DATABASE) {
|
||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, IpcDatabaseSchema, NULL, NULL, NULL));
|
||||
}
|
||||
|
||||
sqlite3_close(db->db);
|
||||
}
|
||||
|
||||
/**
 * Open the SQLite database at db->filename and prepare it for use.
 *
 * Applies global pragmas, then — depending on db->type — prepares the
 * statements and registers the SQL functions that the rest of this file
 * relies on. Crashes (LOG_FATALF via the CRASH_IF_* macros) on any
 * sqlite error; there is no error return.
 */
void database_open(database_t *db) {
    // Fix: log tag was "tpool.c"; every other log call in this file uses "database.c"
    LOG_DEBUGF("database.c", "Opening database %s (%d)", db->filename, db->type);

    CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));

    // Negative cache_size means "KiB of cache" -> ~200 MiB page cache
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA cache_size = -200000;", NULL, NULL, NULL));
    // Durability is traded for speed: a crash may corrupt the file
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA synchronous = OFF;", NULL, NULL, NULL));

    if (db->type == INDEX_DATABASE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA temp_store = memory;", NULL, NULL, NULL));
    }

    if (db->type == INDEX_DATABASE) {
        // Prepare statements;
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "SELECT data FROM thumbnail WHERE id=? AND num=? LIMIT 1;", -1,
                &db->select_thumbnail_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "UPDATE document SET marked=1 WHERE id=? AND mtime=? RETURNING id",
                -1,
                &db->mark_document_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "REPLACE INTO document_sidecar (id, json_data) VALUES (?,?)", -1,
                &db->write_document_sidecar_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "REPLACE INTO document (id, mtime, size, json_data) VALUES (?, ?, ?, ?);", -1,
                &db->write_document_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "INSERT INTO thumbnail (id, num, data) VALUES (?,?,?) ON CONFLICT DO UPDATE SET data=excluded.data;", -1,
                &db->write_thumbnail_stmt, NULL));

        // Create functions
        sqlite3_create_function(
                db->db,
                "path_parent",
                1,
                SQLITE_UTF8,
                NULL,
                path_parent_func,
                NULL,
                NULL
        );
    } else if (db->type == IPC_CONSUMER_DATABASE) {

        // save_current_job_info() records the filepath being worked on,
        // as a side effect of the pop_parse_job RETURNING clause below.
        sqlite3_create_function(
                db->db,
                "save_current_job_info",
                1,
                SQLITE_UTF8,
                db->ipc_ctx,
                save_current_job_info,
                NULL,
                NULL
        );

        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "DELETE FROM parse_job WHERE id = (SELECT MIN(id) FROM parse_job)"
                " RETURNING filepath,mtime,st_size,save_current_job_info(filepath);",
                -1, &db->pop_parse_job_stmt, NULL
        ));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db,
                "DELETE FROM index_job WHERE id = (SELECT MIN(id) FROM index_job)"
                " RETURNING doc_id,type,line;",
                -1, &db->pop_index_job_stmt, NULL
        ));

    } else if (db->type == IPC_PRODUCER_DATABASE) {
        char sql[40];
        int max_size_mb = 10; // TODO: read from args.

        // Bound the IPC queue file size; producers see SQLITE_FULL when the
        // limit is hit and back off (see database_add_work()).
        snprintf(sql, sizeof(sql), "PRAGMA max_page_count=%d", (max_size_mb * 1024 * 1024) / 4096);
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, sql, NULL, NULL, NULL));

        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db, "INSERT INTO parse_job (filepath,mtime,st_size) VALUES (?,?,?);", -1,
                &db->insert_parse_job_stmt, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
                db->db, "INSERT INTO index_job (doc_id,type,line) VALUES (?,?,?);", -1,
                &db->insert_index_job_stmt, NULL));

        sqlite3_create_function(
                db->db,
                "path_parent",
                1,
                SQLITE_UTF8,
                NULL,
                path_parent_func,
                NULL,
                NULL
        );
    }
}
|
||||
|
||||
/**
 * Close the database and free the handle.
 *
 * @param db       heap-allocated handle created by database_create(); freed here.
 *                 The pointer is invalid after this call — ownership is consumed.
 * @param optimize when non-zero, run "PRAGMA optimize" before closing.
 */
void database_close(database_t *db, int optimize) {
    LOG_DEBUGF("database.c", "Closing database %s", db->filename);

    if (optimize) {
        LOG_DEBUG("database.c", "Optimizing database");
        // TODO: This should be an optional argument
        // CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "VACUUM;", NULL, NULL, NULL));
        CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA optimize;", NULL, NULL, NULL));
    }

    sqlite3_close(db->db);
    free(db);
    // Removed dead store `db = NULL;`: assigning to the local parameter has
    // no effect on the caller's pointer and only suggested false safety.
}
|
||||
|
||||
/**
 * Read one thumbnail blob from the index database.
 *
 * @param db               INDEX_DATABASE handle (select_thumbnail_stmt must be prepared).
 * @param id               32-char document id.
 * @param num              thumbnail index for the document.
 * @param return_value_len out: size of the returned buffer in bytes (0 when not found).
 * @return malloc'd copy of the blob (caller frees), or NULL when no such row exists.
 */
void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len) {
    sqlite3_bind_text(db->select_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->select_thumbnail_stmt, 2, num);

    int ret = sqlite3_step(db->select_thumbnail_stmt);

    // Row not found: previously this was a LOG_FATALF with a "TODO: return
    // null" note — implement the intended behavior instead of crashing.
    if (ret == SQLITE_DONE) {
        *return_value_len = 0;
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->select_thumbnail_stmt));
        return NULL;
    }

    if (ret != SQLITE_ROW) {
        LOG_FATALF("database.c", "FIXME: tn step returned %d", ret);
    }

    const void *blob = sqlite3_column_blob(db->select_thumbnail_stmt, 0);
    const int blob_size = sqlite3_column_bytes(db->select_thumbnail_stmt, 0);

    // Copy out: the sqlite-owned blob is only valid until the statement is reset
    *return_value_len = blob_size;
    void *return_data = malloc(blob_size);
    memcpy(return_data, blob, blob_size);

    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->select_thumbnail_stmt));

    return return_data;
}
|
||||
|
||||
/**
 * Persist the index descriptor, replacing any existing one.
 *
 * The descriptor table holds exactly one row; it is cleared before the
 * insert. All sqlite calls are now checked with the file's CRASH_IF_*
 * macros (they were previously unchecked, silently ignoring failures).
 */
void database_write_index_descriptor(database_t *db, index_descriptor_t *desc) {

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM descriptor;", NULL, NULL, NULL));

    sqlite3_stmt *stmt;

    CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
            db->db, "INSERT INTO descriptor (id, version_major, version_minor, version_patch,"
                    " root, name, rewrite_url, timestamp) VALUES (?,?,?,?,?,?,?,?);", -1, &stmt, NULL));
    sqlite3_bind_text(stmt, 1, desc->id, -1, SQLITE_STATIC);
    sqlite3_bind_int(stmt, 2, desc->version_major);
    sqlite3_bind_int(stmt, 3, desc->version_minor);
    sqlite3_bind_int(stmt, 4, desc->version_patch);
    sqlite3_bind_text(stmt, 5, desc->root, -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 6, desc->name, -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 7, desc->rewrite_url, -1, SQLITE_STATIC);
    sqlite3_bind_int64(stmt, 8, desc->timestamp);

    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    CRASH_IF_NOT_SQLITE_OK(sqlite3_finalize(stmt));
}
|
||||
|
||||
/**
 * Read the (single) index descriptor row.
 *
 * @return malloc'd descriptor; caller owns and frees it.
 *
 * Fix: the previous code used CRASH_IF_STMT_FAIL, which accepts SQLITE_DONE
 * (no row). In that case sqlite3_column_text() returns NULL and the
 * strcpy() calls below would dereference NULL. A row is now required
 * explicitly, and the prepare call is checked.
 */
index_descriptor_t *database_read_index_descriptor(database_t *db) {

    sqlite3_stmt *stmt;

    CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
            db->db, "SELECT id, version_major, version_minor, version_patch,"
                    " root, name, rewrite_url, timestamp FROM descriptor;", -1, &stmt, NULL));

    int ret = sqlite3_step(stmt);
    if (ret != SQLITE_ROW) {
        LOG_FATALF("database.c", "Could not read index descriptor (%d): %s", ret, sqlite3_errmsg(db->db));
    }

    const char *id = (char *) sqlite3_column_text(stmt, 0);
    int v_major = sqlite3_column_int(stmt, 1);
    int v_minor = sqlite3_column_int(stmt, 2);
    int v_patch = sqlite3_column_int(stmt, 3);
    const char *root = (char *) sqlite3_column_text(stmt, 4);
    const char *name = (char *) sqlite3_column_text(stmt, 5);
    const char *rewrite_url = (char *) sqlite3_column_text(stmt, 6);
    int timestamp = sqlite3_column_int(stmt, 7);

    // Copy into the heap descriptor before finalize invalidates the
    // sqlite-owned strings.
    index_descriptor_t *desc = malloc(sizeof(index_descriptor_t));
    strcpy(desc->id, id);
    snprintf(desc->version, sizeof(desc->version), "%d.%d.%d", v_major, v_minor, v_patch);
    desc->version_major = v_major;
    desc->version_minor = v_minor;
    desc->version_patch = v_patch;
    strcpy(desc->root, root);
    strcpy(desc->name, name);
    strcpy(desc->rewrite_url, rewrite_url);
    desc->timestamp = timestamp;

    CRASH_IF_NOT_SQLITE_OK(sqlite3_finalize(stmt));

    return desc;
}
|
||||
|
||||
/**
 * Build an iterator over every document as a merged JSON object.
 *
 * Each row is document.json_data, patched with the sidecar json (if any),
 * with '$.tag' set to the aggregated tag array (if any) and '$.index' set
 * to the descriptor id. Consume with database_document_iter(), which
 * finalizes the statement when iteration completes.
 *
 * @return malloc'd iterator; freed by the caller after iteration.
 */
database_iterator_t *database_create_document_iterator(database_t *db) {

    sqlite3_stmt *stmt;

    // TODO: remove mtime, size, _id from json_data

    // Fix: prepare result was previously ignored; a SQL error would have
    // surfaced later as a NULL-statement crash in sqlite3_step().
    CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(db->db, "WITH doc (j) AS (SELECT CASE"
                       " WHEN sc.json_data IS NULL THEN"
                       "  CASE"
                       "   WHEN t.tag IS NULL THEN"
                       "    document.json_data"
                       "   ELSE"
                       "    json_set(document.json_data, '$.tag', json_group_array(t.tag))"
                       "  END"
                       " ELSE"
                       "  CASE"
                       "   WHEN t.tag IS NULL THEN"
                       "    json_patch(document.json_data, sc.json_data)"
                       "   ELSE"
                       // This will overwrite any tags specified in the sidecar file!
                       // TODO: concatenate the two arrays?
                       "    json_set(json_patch(document.json_data, sc.json_data), '$.tag', json_group_array(t.tag))"
                       "  END"
                       " END"
                       " FROM document"
                       " LEFT JOIN document_sidecar sc ON document.id = sc.id"
                       " LEFT JOIN tag t ON document.id = t.id"
                       " GROUP BY document.id)"
                       " SELECT json_set(j, '$.index', (SELECT id FROM descriptor)) FROM doc", -1, &stmt, NULL));

    database_iterator_t *iter = malloc(sizeof(database_iterator_t));

    iter->stmt = stmt;
    iter->db = db;

    return iter;
}
|
||||
|
||||
/**
 * Advance a document iterator created by database_create_document_iterator().
 *
 * @return a freshly parsed cJSON object for the next row (caller owns it
 *         and must cJSON_Delete() it), or NULL when iteration is done.
 *
 * When the last row has been consumed, the statement is finalized and
 * iter->stmt is set to NULL; calling again after that logs an error and
 * returns NULL. Any sqlite error other than SQLITE_DONE is fatal.
 */
cJSON *database_document_iter(database_iterator_t *iter) {

    if (iter->stmt == NULL) {
        LOG_ERROR("database.c", "FIXME: database_document_iter() called after iteration stopped");
        return NULL;
    }

    int ret = sqlite3_step(iter->stmt);

    if (ret == SQLITE_ROW) {
        const char *json_string = (const char *) sqlite3_column_text(iter->stmt, 0);
        return cJSON_Parse(json_string);
    }

    if (ret != SQLITE_DONE) {
        LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
    }

    // Iteration finished: release the statement and mark the iterator spent
    if (sqlite3_finalize(iter->stmt) != SQLITE_OK) {
        LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
    }

    iter->stmt = NULL;

    return NULL;
}
|
||||
|
||||
/**
 * Prepare the index database for an incremental scan by clearing all
 * "marked" flags; database_mark_document() re-marks documents that are
 * seen again, and database_incremental_scan_end() deletes the rest.
 *
 * NOTE(review): the cJSON* return type looks wrong — nothing meaningful is
 * produced; consider changing the signature (and the header) to void.
 * Fix: the function previously fell off the end of a non-void function
 * (undefined behavior if the caller reads the result).
 */
cJSON *database_incremental_scan_begin(database_t *db) {
    LOG_DEBUG("database.c", "Preparing database for incremental scan");
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "UPDATE document SET marked=0;", NULL, NULL, NULL));
    return NULL;
}
|
||||
|
||||
/**
 * Finish an incremental scan: documents still marked=0 were not seen this
 * run, so their thumbnails and sidecars are purged, their ids are added to
 * delete_list (for downstream ES deletion), and the rows are removed.
 * Documents that reappeared (marked=1) are removed from delete_list first.
 *
 * NOTE(review): the cJSON* return type looks wrong — nothing meaningful is
 * produced; consider changing the signature (and the header) to void.
 * Fix: the function previously fell off the end of a non-void function
 * (undefined behavior if the caller reads the result).
 */
cJSON *database_incremental_scan_end(database_t *db) {
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM delete_list WHERE id IN (SELECT id FROM document WHERE marked=1);",
            NULL, NULL, NULL
    ));

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM thumbnail WHERE id IN (SELECT id FROM document WHERE marked=0);",
            NULL, NULL, NULL
    ));

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "INSERT INTO delete_list (id) SELECT id FROM document WHERE marked=0;",
            NULL, NULL, NULL
    ));

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM document_sidecar WHERE id IN (SELECT id FROM document WHERE marked=0);",
            NULL, NULL, NULL
    ));

    // Must run last: the marked=0 rows drive all of the cleanup above
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
            db->db,
            "DELETE FROM document WHERE marked=0;",
            NULL, NULL, NULL
    ));

    return NULL;
}
|
||||
|
||||
/**
 * Mark a document as "seen" during an incremental scan if its mtime is
 * unchanged.
 *
 * @return TRUE when a row matched (document exists with the same mtime —
 *         it can be skipped), FALSE when nothing matched (new/modified).
 *
 * Fixes: the binds now happen while index_db_mutex is held — the prepared
 * statement is shared, so binding outside the lock raced with concurrent
 * step/reset calls. Also added the missing return after CRASH_IF_STMT_FAIL
 * (control previously fell off the end of a non-void function).
 */
int database_mark_document(database_t *db, const char *id, int mtime) {
    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);

    sqlite3_bind_text(db->mark_document_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->mark_document_stmt, 2, mtime);

    int ret = sqlite3_step(db->mark_document_stmt);

    if (ret == SQLITE_ROW) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
        pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
        return TRUE;
    }

    if (ret == SQLITE_DONE) {
        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->mark_document_stmt));
        pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
        return FALSE;
    }
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);

    CRASH_IF_STMT_FAIL(ret);
    return FALSE; // unreachable: CRASH_IF_STMT_FAIL aborts on any other status
}
|
||||
|
||||
/**
 * Insert or replace a document row in the index database.
 *
 * json_data is bound with SQLITE_STATIC, so it must stay valid until the
 * step/reset below completes (it does — both happen before returning).
 *
 * NOTE(review): the binds run before index_db_mutex is taken; if this
 * statement is shared across threads, binding should also be inside the
 * lock — confirm the threading model of database_t.
 */
void database_write_document(database_t *db, document_t *doc, const char *json_data) {
    sqlite3_bind_text(db->write_document_stmt, 1, doc->doc_id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->write_document_stmt, 2, doc->mtime);
    sqlite3_bind_int64(db->write_document_stmt, 3, (long) doc->size);
    sqlite3_bind_text(db->write_document_stmt, 4, json_data, -1, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
|
||||
|
||||
|
||||
/**
 * Insert or replace a sidecar-json row for the given document id.
 *
 * NOTE(review): binds happen before index_db_mutex is acquired — same
 * potential race as database_write_document() if the statement is shared.
 */
void database_write_document_sidecar(database_t *db, const char *id, const char *json_data) {
    sqlite3_bind_text(db->write_document_sidecar_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_text(db->write_document_sidecar_stmt, 2, json_data, -1, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_document_sidecar_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_document_sidecar_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
|
||||
|
||||
/**
 * Insert or update thumbnail blob number `num` for document `id`.
 *
 * data is bound with SQLITE_STATIC and must stay valid until step/reset
 * complete (they do, before this function returns).
 *
 * NOTE(review): binds happen before index_db_mutex is acquired — same
 * potential race as database_write_document() if the statement is shared.
 */
void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size) {
    sqlite3_bind_text(db->write_thumbnail_stmt, 1, id, -1, SQLITE_STATIC);
    sqlite3_bind_int(db->write_thumbnail_stmt, 2, num);
    sqlite3_bind_blob(db->write_thumbnail_stmt, 3, data, (int) data_size, SQLITE_STATIC);

    pthread_mutex_lock(&db->ipc_ctx->index_db_mutex);
    CRASH_IF_STMT_FAIL(sqlite3_step(db->write_thumbnail_stmt));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->write_thumbnail_stmt));
    pthread_mutex_unlock(&db->ipc_ctx->index_db_mutex);
}
|
||||
|
||||
|
||||
//void database_create_fts_index(database_t *db, database_t *fts_db) {
|
||||
// // In a separate file,
|
||||
//
|
||||
// // use database_initialize() to create FTS schema
|
||||
// // if --force-reset, then truncate the tables first
|
||||
//
|
||||
// /*
|
||||
// * create/append fts table
|
||||
// *
|
||||
// * create/append scalar index table with
|
||||
// * id,index,size,mtime,mime
|
||||
// *
|
||||
// * create/append path index table with
|
||||
// * index,path,depth
|
||||
// *
|
||||
// * content table is a view with SELECT UNION for all attached tables
|
||||
// * random_seed column
|
||||
// */
|
||||
//
|
||||
// // INSERT INTO ft(ft) VALUES('optimize');
|
||||
//}
|
||||
|
||||
/**
 * Pop one job from the IPC queue (consumer side).
 *
 * Blocks (polling has_work_cond every 10ms) until job_count > 0 or
 * no_more_jobs is set, then pops the oldest row of the requested queue
 * via the DELETE ... RETURNING statements prepared in database_open().
 *
 * @return malloc'd job_t (caller frees; for JOB_BULK_LINE the embedded
 *         bulk_line is a separate allocation), or NULL when the queue is
 *         drained and no more jobs will arrive.
 */
job_t *database_get_work(database_t *db, job_type_t job_type) {
    job_t *job;

    // Wait for work. ipc_ctx->mutex guards the counters only; the actual
    // queue access below is guarded by db_mutex.
    pthread_mutex_lock(&db->ipc_ctx->mutex);
    while (db->ipc_ctx->job_count == 0 && !db->ipc_ctx->no_more_jobs) {
        pthread_cond_timedwait_ms(&db->ipc_ctx->has_work_cond, &db->ipc_ctx->mutex, 10);
    }
    pthread_mutex_unlock(&db->ipc_ctx->mutex);

    pthread_mutex_lock(&db->ipc_ctx->db_mutex);

    if (job_type == JOB_PARSE_JOB) {
        int ret = sqlite3_step(db->pop_parse_job_stmt);
        if (ret == SQLITE_DONE) {
            // Queue raced to empty between the wait and the pop
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
            pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
            return NULL;
        } else {
            CRASH_IF_STMT_FAIL(ret);
        }

        job = malloc(sizeof(*job));

        // Columns: filepath, mtime, st_size (see pop_parse_job_stmt SQL)
        job->parse_job = create_parse_job(
                (const char *) sqlite3_column_text(db->pop_parse_job_stmt, 0),
                sqlite3_column_int(db->pop_parse_job_stmt, 1),
                sqlite3_column_int64(db->pop_parse_job_stmt, 2));

        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_parse_job_stmt));
    } else {

        int ret = sqlite3_step(db->pop_index_job_stmt);

        if (ret == SQLITE_DONE) {
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
            pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
            return NULL;
        } else {
            CRASH_IF_STMT_FAIL(ret);
        }

        job = malloc(sizeof(*job));

        // line may be NULL (e.g. delete jobs carry no payload); size the
        // flexible-array allocation accordingly.
        const char *line = (const char *) sqlite3_column_text(db->pop_index_job_stmt, 2);
        if (line != NULL) {
            job->bulk_line = malloc(sizeof(es_bulk_line_t) + strlen(line) + 1);
            strcpy(job->bulk_line->line, line);
        } else {
            job->bulk_line = malloc(sizeof(es_bulk_line_t));
        }
        strcpy(job->bulk_line->doc_id, (const char *) sqlite3_column_text(db->pop_index_job_stmt, 0));
        job->bulk_line->type = sqlite3_column_int(db->pop_index_job_stmt, 1);
        job->bulk_line->next = NULL;

        // TODO CRASH IF NOT OK
        // NOTE(review): this steps pop_parse_job_stmt (the *parse* queue)
        // inside the index-job branch and discards the result — looks like
        // a copy/paste bug; confirm whether it should be removed or should
        // target pop_index_job_stmt.
        sqlite3_step(db->pop_parse_job_stmt);

        CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->pop_index_job_stmt));
    }

    pthread_mutex_unlock(&db->ipc_ctx->db_mutex);

    pthread_mutex_lock(&db->ipc_ctx->mutex);
    db->ipc_ctx->job_count -= 1;
    pthread_mutex_unlock(&db->ipc_ctx->mutex);

    job->type = job_type;
    return job;
}
|
||||
|
||||
/**
 * Push one job onto the IPC queue (producer side).
 *
 * The IPC database is size-capped via PRAGMA max_page_count (see
 * database_open()), so inserts can fail with SQLITE_FULL while consumers
 * drain the queue; in that case the insert is retried after a sleep.
 * After a successful insert, job_count is bumped and has_work_cond is
 * signalled so a blocked database_get_work() can proceed.
 */
void database_add_work(database_t *db, job_t *job) {
    int ret;

    pthread_mutex_lock(&db->ipc_ctx->db_mutex);

    if (job->type == JOB_PARSE_JOB) {
        do {
            sqlite3_bind_text(db->insert_parse_job_stmt, 1, job->parse_job->filepath, -1, SQLITE_STATIC);
            sqlite3_bind_int(db->insert_parse_job_stmt, 2, job->parse_job->vfile.mtime);
            sqlite3_bind_int64(db->insert_parse_job_stmt, 3, (long) job->parse_job->vfile.st_size);

            ret = sqlite3_step(db->insert_parse_job_stmt);

            if (ret == SQLITE_FULL) {
                // NOTE(review): this branch sleeps a full second while still
                // holding db_mutex (the bulk-line branch below releases it
                // first) — consumers popping jobs also need db_mutex, so this
                // may stall draining; confirm intent.
                usleep(1000000);
            } else {
                CRASH_IF_STMT_FAIL(ret);
            }

            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->insert_parse_job_stmt));
        } while (ret != SQLITE_DONE);
    } else if (job->type == JOB_BULK_LINE) {
        do {
            sqlite3_bind_text(db->insert_index_job_stmt, 1, job->bulk_line->doc_id, -1, SQLITE_STATIC);
            sqlite3_bind_int(db->insert_index_job_stmt, 2, job->bulk_line->type);
            sqlite3_bind_text(db->insert_index_job_stmt, 3, job->bulk_line->line, -1, SQLITE_STATIC);

            ret = sqlite3_step(db->insert_index_job_stmt);

            if (ret == SQLITE_FULL) {
                // Queue file is at its size cap: release the lock so
                // consumers can drain it, then retry.
                sqlite3_reset(db->insert_index_job_stmt);
                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                usleep(100000);
                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
                continue;
            } else {
                CRASH_IF_STMT_FAIL(ret);
            }

            // reset() itself can also report SQLITE_FULL (deferred error)
            ret = sqlite3_reset(db->insert_index_job_stmt);
            if (ret == SQLITE_FULL) {
                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                usleep(100000);
                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
            }

        } while (ret != SQLITE_DONE && ret != SQLITE_OK);
    } else {
        LOG_FATAL("database.c", "FIXME: invalid job type");
    }
    pthread_mutex_unlock(&db->ipc_ctx->db_mutex);

    // Publish the new job and wake one waiting consumer
    pthread_mutex_lock(&db->ipc_ctx->mutex);
    db->ipc_ctx->job_count += 1;
    pthread_cond_signal(&db->ipc_ctx->has_work_cond);
    pthread_mutex_unlock(&db->ipc_ctx->mutex);
}
|
147
src/database/database.h
Normal file
147
src/database/database.h
Normal file
@ -0,0 +1,147 @@
|
||||
#ifndef SIST2_DATABASE_H
#define SIST2_DATABASE_H

#include <sqlite3.h>
#include <cjson/cJSON.h>
#include "src/sist.h"
#include "src/index/elastic.h"

typedef struct index_descriptor index_descriptor_t;

// SQL schema strings, defined in database_schema.c
extern const char *IpcDatabaseSchema;
extern const char *IndexDatabaseSchema;

// Role of a database_t handle; determines which statements/functions
// database_open() prepares.
typedef enum {
    INDEX_DATABASE,
    IPC_CONSUMER_DATABASE,
    IPC_PRODUCER_DATABASE,
    FTS_DATABASE
} database_type_t;

typedef enum {
    JOB_UNDEFINED,
    JOB_BULK_LINE,
    JOB_PARSE_JOB
} job_type_t;

// One unit of work passed through the IPC queue; payload depends on type.
typedef struct {
    job_type_t type;
    union {
        parse_job_t *parse_job;
        es_bulk_line_t *bulk_line;
    };
} job_t;

// Shared (cross-thread) state for the IPC work queue.
typedef struct {
    int job_count;             // jobs currently queued (guarded by mutex)
    int no_more_jobs;          // producer is done; consumers drain and exit
    int completed_job_count;

    pthread_mutex_t mutex;          // guards the counters above
    pthread_mutex_t db_mutex;       // guards IPC queue statements
    pthread_mutex_t index_db_mutex; // guards index-database write statements
    pthread_cond_t has_work_cond;   // signalled by database_add_work()
    char current_job[256][PATH_MAX * 2]; // per-worker "currently processing" path
} database_ipc_ctx_t;

typedef struct database {
    char filename[PATH_MAX];
    database_type_t type;
    sqlite3 *db;

    // Prepared statements
    sqlite3_stmt *select_thumbnail_stmt;
    sqlite3_stmt *treemap_merge_up_update_stmt;
    sqlite3_stmt *treemap_merge_up_delete_stmt;

    sqlite3_stmt *mark_document_stmt;
    sqlite3_stmt *write_document_stmt;
    sqlite3_stmt *write_document_sidecar_stmt;
    sqlite3_stmt *write_thumbnail_stmt;

    sqlite3_stmt *insert_parse_job_stmt;
    sqlite3_stmt *insert_index_job_stmt;
    sqlite3_stmt *pop_parse_job_stmt;
    sqlite3_stmt *pop_index_job_stmt;

    database_ipc_ctx_t *ipc_ctx;
} database_t;

typedef struct {
    database_t *db;
    sqlite3_stmt *stmt;
} database_iterator_t;

// One row of the treemap aggregation; strings are owned by the statement
// and only valid until the next iterator step.
typedef struct {
    const char *path;
    const char *parent;
    long size;
} treemap_row_t;

// NOTE(review): a `static` object definition in a header creates a separate
// (possibly unused) copy in every translation unit that includes it.
static treemap_row_t null_treemap_row = {0, 0, 0};


database_t *database_create(const char *filename, database_type_t type);

void database_initialize(database_t *db);

void database_open(database_t *db);

void database_close(database_t *, int optimize);

void database_write_thumbnail(database_t *db, const char *id, int num, void *data, size_t data_size);

void *database_read_thumbnail(database_t *db, const char *id, int num, size_t *return_value_len);

void database_write_index_descriptor(database_t *db, index_descriptor_t *desc);

index_descriptor_t *database_read_index_descriptor(database_t *db);

void database_write_document(database_t *db, document_t *doc, const char *json_data);

database_iterator_t *database_create_document_iterator(database_t *db);

cJSON *database_document_iter(database_iterator_t *);

#define database_document_iter_foreach(element, iter) \
    for (cJSON *element = database_document_iter(iter); element != NULL; element = database_document_iter(iter))

cJSON *database_incremental_scan_begin(database_t *db);

cJSON *database_incremental_scan_end(database_t *db);

int database_mark_document(database_t *db, const char *id, int mtime);

void database_write_document_sidecar(database_t *db, const char *id, const char *json_data);

database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold);

treemap_row_t database_treemap_iter(database_iterator_t *iter);

#define database_treemap_iter_foreach(element, iter) \
    for (treemap_row_t element = database_treemap_iter(iter); element.path != NULL; element = database_treemap_iter(iter))


void database_generate_stats(database_t *db, double treemap_threshold);

job_t *database_get_work(database_t *db, job_type_t job_type);

void database_add_work(database_t *db, job_t *job);

//void database_index(database_t *db);

// Abort unless a step() result is SQLITE_DONE or SQLITE_ROW.
// Expects a `db` (database_t *) in scope for the error message.
#define CRASH_IF_STMT_FAIL(x) do { \
    int return_value = x; \
    if (return_value != SQLITE_DONE && return_value != SQLITE_ROW) { \
        LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
    } \
} while (0)

// Abort unless a sqlite call returns SQLITE_OK.
// Expects a `db` (database_t *) in scope for the error message.
#define CRASH_IF_NOT_SQLITE_OK(x) do { \
    int return_value = x; \
    if (return_value != SQLITE_OK) { \
        LOG_FATALF("database.c", "Sqlite error @ database.c:%d : (%d) %s", __LINE__, return_value, sqlite3_errmsg(db->db)); \
    } \
} while (0)

#endif //SIST2_DATABASE_H
|
78
src/database/database_schema.c
Normal file
78
src/database/database_schema.c
Normal file
@ -0,0 +1,78 @@
|
||||
|
||||
// Schema for the transient IPC (work queue) database: two FIFO queues keyed
// by an autoincrementing id; consumers pop MIN(id) (see database_get_work()).
const char *IpcDatabaseSchema =
        "CREATE TABLE parse_job ("
        "  id INTEGER PRIMARY KEY,"
        "  filepath TEXT NOT NULL,"
        "  mtime INTEGER NOT NULL,"
        "  st_size INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE index_job ("
        "  id INTEGER PRIMARY KEY,"
        "  doc_id TEXT NOT NULL CHECK ( length(doc_id) = 32 ),"
        "  type INTEGER NOT NULL,"
        "  line TEXT"
        ");";

// Schema for the persistent *.sist2 index database: documents (with their
// JSON payload), thumbnails, sidecar metadata, tags, the single-row index
// descriptor, and pre-computed stats tables filled by database_generate_stats().
const char *IndexDatabaseSchema =
        "CREATE TABLE thumbnail ("
        "  id TEXT NOT NULL CHECK ( length(id) = 32 ),"
        "  num INTEGER NOT NULL,"
        "  data BLOB NOT NULL,"
        "  PRIMARY KEY(id, num)"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE document ("
        "  id TEXT PRIMARY KEY CHECK ( length(id) = 32 ),"
        "  marked INTEGER NOT NULL DEFAULT (1),"
        "  mtime INTEGER NOT NULL,"
        "  size INTEGER NOT NULL,"
        "  json_data TEXT NOT NULL CHECK ( json_valid(json_data) )"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE delete_list ("
        "  id TEXT PRIMARY KEY CHECK ( length(id) = 32 )"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE tag ("
        "  id TEXT NOT NULL,"
        "  tag TEXT NOT NULL"
        ");"
        ""
        "CREATE TABLE document_sidecar ("
        "  id TEXT PRIMARY KEY NOT NULL,"
        "  json_data TEXT NOT NULL"
        ") WITHOUT ROWID;"
        ""
        "CREATE TABLE descriptor ("
        "  id TEXT NOT NULL,"
        "  version_major INTEGER NOT NULL,"
        "  version_minor INTEGER NOT NULL,"
        "  version_patch INTEGER NOT NULL,"
        "  root TEXT NOT NULL,"
        "  name TEXT NOT NULL,"
        "  rewrite_url TEXT,"
        "  timestamp INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_treemap ("
        "  path TEXT NOT NULL,"
        "  size INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_size_agg ("
        "  bucket INTEGER NOT NULL,"
        "  count INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_date_agg ("
        "  bucket INTEGER NOT NULL,"
        "  count INTEGER NOT NULL"
        ");"
        ""
        "CREATE TABLE stats_mime_agg ("
        "  mime TEXT NOT NULL,"
        "  size INTEGER NOT NULL,"
        "  count INTEGER NOT NULL"
        ");";
|
||||
|
159
src/database/database_stats.c
Normal file
159
src/database/database_stats.c
Normal file
@ -0,0 +1,159 @@
|
||||
#include "database.h"
|
||||
#include "src/sist.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
#define TREEMAP_MINIMUM_MERGES_TO_CONTINUE (100)
|
||||
#define SIZE_BUCKET (long)(5 * 1000 * 1000)
|
||||
#define DATE_BUCKET (long)(2629800) // ~30 days
|
||||
|
||||
/**
 * Build an iterator over treemap rows eligible for merging: rows of the
 * temp table `tm` whose parent path also exists in `tm` and whose size is
 * below `threshold`. Used by database_generate_stats().
 *
 * NOTE(review): sqlite3_prepare_v2 result is unchecked here — a SQL error
 * would only surface as a NULL-statement failure in the first iter step.
 *
 * @return malloc'd iterator; the statement is finalized by
 *         database_treemap_iter() when iteration ends.
 */
database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold) {

    sqlite3_stmt *stmt;

    sqlite3_prepare_v2(db->db,
                       "SELECT path, path_parent(path), size FROM tm"
                       " WHERE path_parent(path) IN (SELECT path FROM tm)"
                       " AND size<?",
                       -1, &stmt, NULL);

    sqlite3_bind_int64(stmt, 1, threshold);

    database_iterator_t *iter = malloc(sizeof(database_iterator_t));

    iter->stmt = stmt;
    iter->db = db;

    return iter;
}
|
||||
|
||||
/**
 * Advance a treemap iterator.
 *
 * @return the next row, or a {NULL, NULL, 0} sentinel when iteration is
 *         done (the statement is finalized at that point).
 *
 * The returned path/parent strings are owned by sqlite and are only valid
 * until the next call on this iterator — copy them if they must outlive it.
 * Calling again after the sentinel is a fatal error.
 */
treemap_row_t database_treemap_iter(database_iterator_t *iter) {

    if (iter->stmt == NULL) {
        LOG_FATAL("database.c", "FIXME: database_treemap_iter() called after iteration stopped");
    }

    int ret = sqlite3_step(iter->stmt);

    if (ret == SQLITE_ROW) {
        treemap_row_t row = {
                .path = (const char *) sqlite3_column_text(iter->stmt, 0),
                .parent = (const char *) sqlite3_column_text(iter->stmt, 1),
                .size = sqlite3_column_int64(iter->stmt, 2)
        };

        return row;
    }

    if (ret != SQLITE_DONE) {
        LOG_FATALF("database.c", "FIXME: doc iter returned %s", sqlite3_errmsg(iter->db->db));
    }

    sqlite3_finalize(iter->stmt);
    iter->stmt = NULL;

    return (treemap_row_t) {NULL, NULL, 0};
}
|
||||
|
||||
/**
 * Recompute all stats_* tables of an index database.
 *
 * Steps:
 *  1. truncate the four stats tables;
 *  2. aggregate document sizes into SIZE_BUCKET-wide buckets (stats_size_agg);
 *  3. aggregate mtimes into DATE_BUCKET-wide buckets (stats_date_agg);
 *  4. aggregate size/count per mime type (stats_mime_agg);
 *  5. build a treemap: flat per-path sums in a TEMP table `tm`, then
 *     repeatedly merge rows smaller than (total_size * treemap_threshold)
 *     into their parent paths until fewer than
 *     TREEMAP_MINIMUM_MERGES_TO_CONTINUE rows change; result goes to
 *     stats_treemap.
 *
 * NOTE(review): the treemap_merge_up_* statements prepared here (and the
 * per-iteration INSERT statement in the merge loop) are never finalized —
 * statement leak; also several sqlite3_prepare_v2 results are unchecked.
 */
void database_generate_stats(database_t *db, double treemap_threshold) {

    LOG_INFO("database.c", "Generating stats");

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_size_agg;", NULL, NULL, NULL));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_date_agg;", NULL, NULL, NULL));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_mime_agg;", NULL, NULL, NULL));
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM stats_treemap;", NULL, NULL, NULL));

    CRASH_IF_NOT_SQLITE_OK(
            sqlite3_exec(db->db, "CREATE TEMP TABLE tm(path TEXT PRIMARY KEY, size INT);", NULL, NULL, NULL));

    sqlite3_prepare_v2(db->db, "UPDATE tm SET size=size+? WHERE path=?;", -1, &db->treemap_merge_up_update_stmt, NULL);
    sqlite3_prepare_v2(db->db, "DELETE FROM tm WHERE path = ?;", -1, &db->treemap_merge_up_delete_stmt, NULL);

    // size aggregation
    sqlite3_stmt *stmt;
    sqlite3_prepare_v2(db->db, "INSERT INTO stats_size_agg"
                               " SELECT"
                               "    cast(size / ?1 as int) * ?1 as bucket,"
                               "    count(*) as count"
                               " FROM document"
                               " GROUP BY bucket", -1, &stmt, NULL);
    sqlite3_bind_int(stmt, 1, SIZE_BUCKET);
    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    sqlite3_finalize(stmt);

    // date aggregation
    sqlite3_prepare_v2(db->db, "INSERT INTO stats_date_agg"
                               " SELECT"
                               "    cast(mtime / ?1 as int) * ?1 as bucket,"
                               "    count(*) as count"
                               " FROM document"
                               " GROUP BY bucket", -1, &stmt, NULL);
    sqlite3_bind_int(stmt, 1, DATE_BUCKET);
    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    sqlite3_finalize(stmt);

    // mime aggregation
    sqlite3_prepare_v2(db->db, "INSERT INTO stats_mime_agg"
                               " SELECT"
                               "    (json_data->>'mime') as bucket,"
                               "    sum(size),"
                               "    count(*)"
                               " FROM document"
                               " WHERE bucket IS NOT NULL"
                               " GROUP BY bucket", -1, &stmt, NULL);
    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

    sqlite3_finalize(stmt);

    // Treemap
    sqlite3_prepare_v2(db->db, "SELECT SUM(size) FROM document;", -1, &stmt, NULL);
    CRASH_IF_STMT_FAIL(sqlite3_step(stmt));
    long total_size = sqlite3_column_int64(stmt, 0);
    long threshold = (long) ((double) total_size * treemap_threshold);
    sqlite3_finalize(stmt);

    // flat map
    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
                                        "INSERT INTO tm (path, size) SELECT json_data->>'path' as path, sum(size)"
                                        " FROM document WHERE json_data->>'parent' IS NULL GROUP BY path;",
                                        NULL, NULL, NULL));

    // Merge up
    int merged_rows = 0;
    do {
        if (merged_rows) {
            LOG_INFOF("database.c", "Treemap merge iteration (%d rows changed)", merged_rows);
        }
        merged_rows = 0;

        // Make sure every small row has a parent row in tm to merge into
        sqlite3_prepare_v2(db->db,
                           "INSERT INTO tm (path, size) SELECT path_parent(path) as parent, 0 "
                           " FROM tm WHERE parent not IN (SELECT path FROM tm) AND size<?"
                           " ON CONFLICT DO NOTHING;", -1, &stmt, NULL);
        sqlite3_bind_int64(stmt, 1, threshold);
        CRASH_IF_STMT_FAIL(sqlite3_step(stmt));

        // Fold each small row's size into its parent, then delete it
        database_iterator_t *iter = database_create_treemap_iterator(db, threshold);
        database_treemap_iter_foreach(row, iter) {
            sqlite3_bind_int64(db->treemap_merge_up_update_stmt, 1, row.size);
            sqlite3_bind_text(db->treemap_merge_up_update_stmt, 2, row.parent, -1, SQLITE_STATIC);
            CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_update_stmt));
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_update_stmt));

            sqlite3_bind_text(db->treemap_merge_up_delete_stmt, 1, row.path, -1, SQLITE_STATIC);
            CRASH_IF_STMT_FAIL(sqlite3_step(db->treemap_merge_up_delete_stmt));
            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->treemap_merge_up_delete_stmt));

            merged_rows += 1;
        }
    } while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE);

    CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
                                        "INSERT INTO stats_treemap (path, size) SELECT path,size FROM tm;",
                                        NULL, NULL, NULL));

    LOG_INFO("database.c", "Done!");
}
|
||||
|
5
src/database/database_stats.h
Normal file
5
src/database/database_stats.h
Normal file
@ -0,0 +1,5 @@
|
||||
// Stats-generation module header. Currently empty: the public entry points
// (database_generate_stats, treemap iterator) are declared in database.h.
#ifndef SIST2_DATABASE_STATS_H
#define SIST2_DATABASE_STATS_H


#endif //SIST2_DATABASE_STATS_H
|
@ -29,7 +29,7 @@ void destroy_indexer(es_indexer_t *indexer) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG_DEBUG("elastic.c", "Destroying indexer")
|
||||
LOG_DEBUG("elastic.c", "Destroying indexer");
|
||||
|
||||
if (indexer->es_url != NULL) {
|
||||
free(indexer->es_url);
|
||||
@ -64,26 +64,21 @@ void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {
|
||||
cJSON_Delete(line);
|
||||
}
|
||||
|
||||
void index_json_func(tpool_work_arg_shm_t *arg) {
|
||||
// Copy arg to heap because it's going to be freed immediately after this function returns
|
||||
es_bulk_line_t *line = malloc(arg->arg_size);
|
||||
memcpy(line, arg->arg, arg->arg_size);
|
||||
|
||||
elastic_index_line(line);
|
||||
void index_json_func(job_t *job) {
|
||||
elastic_index_line(job->bulk_line);
|
||||
}
|
||||
|
||||
void delete_document(const char *document_id_str, void *UNUSED(_data)) {
|
||||
void delete_document(const char *document_id) {
|
||||
es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t));
|
||||
|
||||
bulk_line->type = ES_BULK_LINE_DELETE;
|
||||
bulk_line->next = NULL;
|
||||
strcpy(bulk_line->doc_id, document_id_str);
|
||||
strcpy(bulk_line->doc_id, document_id);
|
||||
|
||||
tpool_work_arg_t arg = {
|
||||
.arg_size = sizeof(es_bulk_line_t),
|
||||
.arg = bulk_line
|
||||
};
|
||||
tpool_add_work(IndexCtx.pool, index_json_func, &arg);
|
||||
tpool_add_work(IndexCtx.pool, &(job_t) {
|
||||
.type = JOB_BULK_LINE,
|
||||
.bulk_line = bulk_line,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ -100,11 +95,10 @@ void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
|
||||
bulk_line->next = NULL;
|
||||
|
||||
cJSON_free(json);
|
||||
tpool_work_arg_t arg = {
|
||||
.arg_size = sizeof(es_bulk_line_t) + json_len + 2,
|
||||
.arg = bulk_line
|
||||
};
|
||||
tpool_add_work(IndexCtx.pool, index_json_func, &arg);
|
||||
tpool_add_work(IndexCtx.pool, &(job_t) {
|
||||
.type = JOB_BULK_LINE,
|
||||
.bulk_line = bulk_line,
|
||||
});
|
||||
}
|
||||
|
||||
void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {
|
||||
@ -278,7 +272,7 @@ void print_error(response_t *r) {
|
||||
void _elastic_flush(int max) {
|
||||
|
||||
if (max == 0) {
|
||||
LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue")
|
||||
LOG_WARNING("elastic.c", "calling _elastic_flush with 0 in queue");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -291,13 +285,13 @@ void _elastic_flush(int max) {
|
||||
response_t *r = web_post(bulk_url, buf, IndexCtx.es_insecure_ssl);
|
||||
|
||||
if (r->status_code == 0) {
|
||||
LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url)
|
||||
LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url);
|
||||
}
|
||||
|
||||
if (r->status_code == 413) {
|
||||
|
||||
if (max <= 1) {
|
||||
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id)
|
||||
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->doc_id);
|
||||
free_response(r);
|
||||
free(buf);
|
||||
free_queue(1);
|
||||
@ -318,7 +312,7 @@ void _elastic_flush(int max) {
|
||||
|
||||
free_response(r);
|
||||
free(buf);
|
||||
LOG_WARNING("elastic.c", "Got 429 status, will retry after delay")
|
||||
LOG_WARNING("elastic.c", "Got 429 status, will retry after delay");
|
||||
usleep(1000000 * 20);
|
||||
_elastic_flush(max);
|
||||
return;
|
||||
@ -453,7 +447,7 @@ es_version_t *elastic_get_version(const char *es_url, int insecure) {
|
||||
}
|
||||
|
||||
if (cJSON_GetObjectItem(response, "error") != NULL) {
|
||||
LOG_WARNING("elastic.c", "Could not get Elasticsearch version")
|
||||
LOG_WARNING("elastic.c", "Could not get Elasticsearch version");
|
||||
print_error(r);
|
||||
free_response(r);
|
||||
return NULL;
|
||||
@ -489,7 +483,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
|
||||
IndexCtx.es_version = es_version;
|
||||
|
||||
if (es_version == NULL) {
|
||||
LOG_FATAL("elastic.c", "Could not get ES version")
|
||||
LOG_FATAL("elastic.c", "Could not get ES version");
|
||||
}
|
||||
|
||||
LOG_INFOF("elastic.c",
|
||||
@ -497,7 +491,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
|
||||
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), IS_LEGACY_VERSION(es_version));
|
||||
|
||||
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
|
||||
LOG_FATAL("elastic.c", "This elasticsearch version is not supported!")
|
||||
LOG_FATAL("elastic.c", "This elasticsearch version is not supported!");
|
||||
}
|
||||
|
||||
char *settings = NULL;
|
||||
@ -524,7 +518,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
|
||||
|
||||
if (r->status_code != 200) {
|
||||
print_error(r);
|
||||
LOG_FATAL("elastic.c", "Could not create index")
|
||||
LOG_FATAL("elastic.c", "Could not create index");
|
||||
}
|
||||
|
||||
LOG_INFOF("elastic.c", "Create index <%d>", r->status_code);
|
||||
@ -545,7 +539,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
|
||||
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
|
||||
if (r->status_code != 200) {
|
||||
print_error(r);
|
||||
LOG_FATAL("elastic.c", "Could not update user settings")
|
||||
LOG_FATAL("elastic.c", "Could not update user settings");
|
||||
}
|
||||
free_response(r);
|
||||
|
||||
@ -560,7 +554,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
|
||||
LOG_INFOF("elastic.c", "Update ES mappings <%d>", r->status_code);
|
||||
if (r->status_code != 200) {
|
||||
print_error(r);
|
||||
LOG_FATAL("elastic.c", "Could not update user mappings")
|
||||
LOG_FATAL("elastic.c", "Could not update user mappings");
|
||||
}
|
||||
free_response(r);
|
||||
|
||||
|
@ -46,7 +46,7 @@ void print_json(cJSON *document, const char index_id_str[SIST_INDEX_ID_LEN]);
|
||||
|
||||
void index_json(cJSON *document, const char doc_id[SIST_INDEX_ID_LEN]);
|
||||
|
||||
void delete_document(const char *document_id_str, void* data);
|
||||
void delete_document(const char *document_id);
|
||||
|
||||
es_indexer_t *create_indexer(const char *url, const char *index);
|
||||
|
||||
|
@ -65,7 +65,7 @@ void web_post_async_poll(subreq_ctx_t *req) {
|
||||
curl_easy_getinfo(req->handle, CURLINFO_RESPONSE_CODE, &req->response->status_code);
|
||||
|
||||
if (req->response->status_code == 0) {
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer)
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", req->curl_err_buffer);
|
||||
}
|
||||
|
||||
curl_multi_cleanup(req->multi);
|
||||
@ -104,7 +104,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
|
||||
curl_multi_add_handle(req->multi, curl);
|
||||
curl_multi_perform(req->multi, &req->running_handles);
|
||||
|
||||
LOG_DEBUGF("web.c", "async request POST %s", url)
|
||||
LOG_DEBUGF("web.c", "async request POST %s", url);
|
||||
|
||||
return req;
|
||||
}
|
||||
@ -136,7 +136,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp->status_code);
|
||||
|
||||
if (resp->status_code == 0) {
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
|
||||
}
|
||||
|
||||
curl_easy_cleanup(curl);
|
||||
@ -180,7 +180,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
|
||||
resp->size = buffer.cur;
|
||||
|
||||
if (resp->status_code == 0) {
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer)
|
||||
LOG_ERRORF("web.c", "CURL Error: %s", err_buffer);
|
||||
}
|
||||
|
||||
curl_easy_cleanup(curl);
|
||||
|
@ -1,9 +1,7 @@
|
||||
#include "src/ctx.h"
|
||||
#include "serialize.h"
|
||||
#include "src/parsing/parse.h"
|
||||
#include "src/parsing/mime.h"
|
||||
|
||||
#include <zstd.h>
|
||||
|
||||
char *get_meta_key_text(enum metakey meta_key) {
|
||||
|
||||
@ -79,7 +77,7 @@ char *get_meta_key_text(enum metakey meta_key) {
|
||||
case MetaChecksum:
|
||||
return "checksum";
|
||||
default:
|
||||
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
|
||||
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key);
|
||||
}
|
||||
}
|
||||
|
||||
@ -175,7 +173,7 @@ char *build_json_string(document_t *doc) {
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key))
|
||||
LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key));
|
||||
}
|
||||
|
||||
meta_line_t *tmp = meta;
|
||||
@ -189,394 +187,10 @@ char *build_json_string(document_t *doc) {
|
||||
return json_str;
|
||||
}
|
||||
|
||||
static struct {
|
||||
FILE *out_file;
|
||||
size_t buf_out_size;
|
||||
|
||||
void *buf_out;
|
||||
|
||||
ZSTD_CCtx *cctx;
|
||||
} WriterCtx = {
|
||||
.out_file = NULL
|
||||
};
|
||||
|
||||
#define ZSTD_COMPRESSION_LEVEL 10
|
||||
|
||||
void initialize_writer_ctx(const char *file_path) {
|
||||
WriterCtx.out_file = fopen(file_path, "wb");
|
||||
|
||||
WriterCtx.buf_out_size = ZSTD_CStreamOutSize();
|
||||
WriterCtx.buf_out = malloc(WriterCtx.buf_out_size);
|
||||
|
||||
WriterCtx.cctx = ZSTD_createCCtx();
|
||||
|
||||
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_compressionLevel, ZSTD_COMPRESSION_LEVEL);
|
||||
ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_checksumFlag, FALSE);
|
||||
|
||||
LOG_DEBUGF("serialize.c", "Open index file for writing %s", file_path)
|
||||
}
|
||||
|
||||
void zstd_write_string(const char *string, const size_t len) {
|
||||
ZSTD_inBuffer input = {string, len, 0};
|
||||
|
||||
do {
|
||||
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
|
||||
ZSTD_compressStream2(WriterCtx.cctx, &output, &input, ZSTD_e_continue);
|
||||
|
||||
if (output.pos > 0) {
|
||||
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
|
||||
}
|
||||
} while (input.pos != input.size);
|
||||
}
|
||||
|
||||
void write_document_func(tpool_work_arg_shm_t *arg) {
|
||||
|
||||
const char *json_str = arg->arg;
|
||||
|
||||
if (WriterCtx.out_file == NULL) {
|
||||
char dstfile[PATH_MAX];
|
||||
snprintf(dstfile, PATH_MAX, "%s_index_main.ndjson.zst", ScanCtx.index.path);
|
||||
initialize_writer_ctx(dstfile);
|
||||
}
|
||||
|
||||
zstd_write_string(json_str, arg->arg_size);
|
||||
}
|
||||
|
||||
void zstd_close() {
|
||||
if (WriterCtx.out_file == NULL) {
|
||||
LOG_DEBUG("serialize.c", "No zstd stream to close, skipping cleanup")
|
||||
return;
|
||||
}
|
||||
|
||||
size_t remaining;
|
||||
do {
|
||||
ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
|
||||
remaining = ZSTD_endStream(WriterCtx.cctx, &output);
|
||||
|
||||
if (output.pos > 0) {
|
||||
ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
|
||||
}
|
||||
} while (remaining != 0);
|
||||
|
||||
ZSTD_freeCCtx(WriterCtx.cctx);
|
||||
free(WriterCtx.buf_out);
|
||||
fclose(WriterCtx.out_file);
|
||||
|
||||
LOG_DEBUG("serialize.c", "End zstd stream & close index file")
|
||||
}
|
||||
|
||||
void writer_cleanup() {
|
||||
zstd_close();
|
||||
WriterCtx.out_file = NULL;
|
||||
}
|
||||
|
||||
void write_index_descriptor(char *path, index_descriptor_t *desc) {
|
||||
cJSON *json = cJSON_CreateObject();
|
||||
cJSON_AddStringToObject(json, "id", desc->id);
|
||||
cJSON_AddStringToObject(json, "version", desc->version);
|
||||
cJSON_AddStringToObject(json, "root", desc->root);
|
||||
cJSON_AddStringToObject(json, "name", desc->name);
|
||||
cJSON_AddStringToObject(json, "type", desc->type);
|
||||
cJSON_AddStringToObject(json, "rewrite_url", desc->rewrite_url);
|
||||
cJSON_AddNumberToObject(json, "timestamp", (double) desc->timestamp);
|
||||
|
||||
int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
|
||||
if (fd < 0) {
|
||||
LOG_FATALF("serialize.c", "Could not open index descriptor: %s", strerror(errno));
|
||||
}
|
||||
char *str = cJSON_Print(json);
|
||||
size_t ret = write(fd, str, strlen(str));
|
||||
if (ret == -1) {
|
||||
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
|
||||
}
|
||||
free(str);
|
||||
close(fd);
|
||||
|
||||
cJSON_Delete(json);
|
||||
}
|
||||
|
||||
index_descriptor_t read_index_descriptor(char *path) {
|
||||
|
||||
struct stat info;
|
||||
stat(path, &info);
|
||||
int fd = open(path, O_RDONLY);
|
||||
|
||||
if (fd == -1) {
|
||||
LOG_FATALF("serialize.c", "Invalid/corrupt index (Could not find descriptor): %s: %s\n", path, strerror(errno))
|
||||
}
|
||||
|
||||
char *buf = malloc(info.st_size + 1);
|
||||
size_t ret = read(fd, buf, info.st_size);
|
||||
if (ret == -1) {
|
||||
LOG_FATALF("serialize.c", "Could not read index descriptor: %s", strerror(errno));
|
||||
}
|
||||
*(buf + info.st_size) = '\0';
|
||||
close(fd);
|
||||
|
||||
cJSON *json = cJSON_Parse(buf);
|
||||
|
||||
index_descriptor_t descriptor;
|
||||
descriptor.timestamp = (long) cJSON_GetObjectItem(json, "timestamp")->valuedouble;
|
||||
strcpy(descriptor.root, cJSON_GetObjectItem(json, "root")->valuestring);
|
||||
strcpy(descriptor.name, cJSON_GetObjectItem(json, "name")->valuestring);
|
||||
strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
|
||||
descriptor.root_len = (short) strlen(descriptor.root);
|
||||
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
|
||||
strcpy(descriptor.id, cJSON_GetObjectItem(json, "id")->valuestring);
|
||||
if (cJSON_GetObjectItem(json, "type") == NULL) {
|
||||
strcpy(descriptor.type, INDEX_TYPE_NDJSON);
|
||||
} else {
|
||||
strcpy(descriptor.type, cJSON_GetObjectItem(json, "type")->valuestring);
|
||||
}
|
||||
|
||||
cJSON_Delete(json);
|
||||
free(buf);
|
||||
|
||||
return descriptor;
|
||||
}
|
||||
|
||||
|
||||
void write_document(document_t *doc) {
|
||||
char *json_str = build_json_string(doc);
|
||||
|
||||
database_write_document(ProcData.index_db, doc, json_str);
|
||||
free(doc);
|
||||
const size_t json_str_len = strlen(json_str);
|
||||
|
||||
json_str = realloc(json_str, json_str_len + 1);
|
||||
*(json_str + json_str_len) = '\n';
|
||||
|
||||
tpool_work_arg_t arg = {
|
||||
.arg_size = json_str_len + 1,
|
||||
.arg = json_str
|
||||
};
|
||||
|
||||
tpool_add_work(ScanCtx.writer_pool, write_document_func, &arg);
|
||||
}
|
||||
|
||||
void thread_cleanup() {
|
||||
cleanup_parse();
|
||||
cleanup_font();
|
||||
}
|
||||
|
||||
void read_index_bin_handle_line(const char *line, const char *index_id, index_func func) {
|
||||
|
||||
cJSON *document = cJSON_Parse(line);
|
||||
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
|
||||
|
||||
cJSON_AddStringToObject(document, "index", index_id);
|
||||
|
||||
// Load meta from sidecar files
|
||||
cJSON *meta_obj = NULL;
|
||||
if (IndexCtx.meta != NULL) {
|
||||
const char *meta_string = g_hash_table_lookup(IndexCtx.meta, path_md5_str);
|
||||
if (meta_string != NULL) {
|
||||
meta_obj = cJSON_Parse(meta_string);
|
||||
|
||||
cJSON *child;
|
||||
for (child = meta_obj->child; child != NULL; child = child->next) {
|
||||
char meta_key[4096];
|
||||
strcpy(meta_key, child->string);
|
||||
cJSON_DeleteItemFromObject(document, meta_key);
|
||||
cJSON_AddItemReferenceToObject(document, meta_key, child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load tags from tags DB
|
||||
if (IndexCtx.tags != NULL) {
|
||||
const char *tags_string = g_hash_table_lookup(IndexCtx.tags, path_md5_str);
|
||||
if (tags_string != NULL) {
|
||||
cJSON *tags_arr = cJSON_Parse(tags_string);
|
||||
cJSON_DeleteItemFromObject(document, "tag");
|
||||
cJSON_AddItemToObject(document, "tag", tags_arr);
|
||||
}
|
||||
}
|
||||
|
||||
func(document, path_md5_str);
|
||||
cJSON_DeleteItemFromObject(document, "_id");
|
||||
cJSON_Delete(document);
|
||||
if (meta_obj) {
|
||||
cJSON_Delete(meta_obj);
|
||||
}
|
||||
}
|
||||
|
||||
void read_lines(const char *path, const line_processor_t processor) {
|
||||
dyn_buffer_t buf = dyn_buffer_create();
|
||||
|
||||
// Initialize zstd things
|
||||
FILE *file = fopen(path, "rb");
|
||||
|
||||
size_t const buf_in_size = ZSTD_DStreamInSize();
|
||||
void *const buf_in = malloc(buf_in_size);
|
||||
|
||||
size_t const buf_out_size = ZSTD_DStreamOutSize();
|
||||
void *const buf_out = malloc(buf_out_size);
|
||||
|
||||
ZSTD_DCtx *const dctx = ZSTD_createDCtx();
|
||||
|
||||
size_t read;
|
||||
size_t last_ret = 0;
|
||||
while ((read = fread(buf_in, 1, buf_in_size, file))) {
|
||||
ZSTD_inBuffer input = {buf_in, read, 0};
|
||||
|
||||
while (input.pos < input.size) {
|
||||
ZSTD_outBuffer output = {buf_out, buf_out_size, 0};
|
||||
|
||||
size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
|
||||
|
||||
for (int i = 0; i < output.pos; i++) {
|
||||
char c = ((char *) output.dst)[i];
|
||||
|
||||
if (c == '\n') {
|
||||
dyn_buffer_write_char(&buf, '\0');
|
||||
processor.func(buf.buf, processor.data);
|
||||
buf.cur = 0;
|
||||
} else {
|
||||
dyn_buffer_write_char(&buf, c);
|
||||
}
|
||||
}
|
||||
|
||||
last_ret = ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (last_ret != 0) {
|
||||
/* The last return value from ZSTD_decompressStream did not end on a
|
||||
* frame, but we reached the end of the file! We assume this is an
|
||||
* error, and the input was truncated.
|
||||
*/
|
||||
LOG_FATALF("serialize.c", "EOF before end of stream: %zu", last_ret)
|
||||
}
|
||||
|
||||
ZSTD_freeDCtx(dctx);
|
||||
free(buf_in);
|
||||
free(buf_out);
|
||||
|
||||
dyn_buffer_destroy(&buf);
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
void read_index_ndjson(const char *line, void *_data) {
|
||||
void **data = _data;
|
||||
const char *index_id = data[0];
|
||||
index_func func = data[1];
|
||||
read_index_bin_handle_line(line, index_id, func);
|
||||
}
|
||||
|
||||
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
|
||||
if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
|
||||
read_lines(path, (line_processor_t) {
|
||||
.data = (void *[2]) {(void *) index_id, func},
|
||||
.func = read_index_ndjson,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static __thread GHashTable *IncrementalReadTable = NULL;
|
||||
|
||||
void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
|
||||
const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
|
||||
const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;
|
||||
|
||||
incremental_put(IncrementalReadTable, path_md5_str, mtime);
|
||||
}
|
||||
|
||||
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
|
||||
IncrementalReadTable = table;
|
||||
read_index(filepath, desc->id, desc->type, json_put_incremental);
|
||||
}
|
||||
|
||||
static __thread GHashTable *IncrementalCopyTable = NULL;
|
||||
static __thread GHashTable *IncrementalNewTable = NULL;
|
||||
static __thread store_t *IncrementalCopySourceStore = NULL;
|
||||
static __thread store_t *IncrementalCopyDestinationStore = NULL;
|
||||
|
||||
void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
|
||||
|
||||
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
|
||||
|
||||
if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
|
||||
// Copy index line
|
||||
cJSON_DeleteItemFromObject(document, "index");
|
||||
char *json_str = cJSON_PrintUnformatted(document);
|
||||
const size_t json_str_len = strlen(json_str);
|
||||
|
||||
json_str = realloc(json_str, json_str_len + 1);
|
||||
*(json_str + json_str_len) = '\n';
|
||||
|
||||
// Copy tn store contents
|
||||
size_t buf_len;
|
||||
char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len);
|
||||
if (buf_len != 0) {
|
||||
store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
// Also copy additional thumbnails
|
||||
if (cJSON_GetObjectItem(document, "thumbnail") != NULL) {
|
||||
const int thumbnail_count = cJSON_GetObjectItem(document, "thumbnail")->valueint;
|
||||
|
||||
for (int i = 1; i < thumbnail_count; i++) {
|
||||
char tn_key[SIST_DOC_ID_LEN + sizeof(char) * 4];
|
||||
|
||||
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc_id, i);
|
||||
|
||||
buf = store_read(IncrementalCopySourceStore, tn_key, sizeof(tn_key), &buf_len);
|
||||
if (buf_len != 0) {
|
||||
store_write(IncrementalCopyDestinationStore, tn_key, sizeof(tn_key), buf, buf_len);
|
||||
free(buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
zstd_write_string(json_str, json_str_len + 1);
|
||||
free(json_str);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy items from an index that are in the copy_table. Also copies from
|
||||
* the store.
|
||||
*/
|
||||
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
|
||||
const char *dst_filepath, GHashTable *copy_table) {
|
||||
|
||||
if (WriterCtx.out_file == NULL) {
|
||||
initialize_writer_ctx(dst_filepath);
|
||||
}
|
||||
|
||||
IncrementalCopyTable = copy_table;
|
||||
IncrementalCopySourceStore = store;
|
||||
IncrementalCopyDestinationStore = dst_store;
|
||||
|
||||
read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
|
||||
}
|
||||
|
||||
void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
|
||||
|
||||
char doc_id_n[SIST_DOC_ID_LEN + 1];
|
||||
doc_id_n[SIST_DOC_ID_LEN] = '\0';
|
||||
doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
|
||||
const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;
|
||||
|
||||
// do not delete archive virtual entries
|
||||
if (cJSON_GetObjectItem(document, "parent") == NULL
|
||||
&& !incremental_get(IncrementalCopyTable, doc_id)
|
||||
&& !incremental_get(IncrementalNewTable, doc_id)
|
||||
) {
|
||||
memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
|
||||
zstd_write_string(doc_id, sizeof(doc_id_n));
|
||||
}
|
||||
}
|
||||
|
||||
void incremental_delete(const char *del_filepath, const char *index_filepath,
|
||||
GHashTable *copy_table, GHashTable *new_table) {
|
||||
|
||||
if (WriterCtx.out_file == NULL) {
|
||||
initialize_writer_ctx(del_filepath);
|
||||
}
|
||||
|
||||
IncrementalCopyTable = copy_table;
|
||||
IncrementalNewTable = new_table;
|
||||
|
||||
read_index(index_filepath, "", INDEX_TYPE_NDJSON, incremental_delete_handle_doc);
|
||||
}
|
||||
free(json_str);
|
||||
}
|
@ -2,55 +2,7 @@
|
||||
#define SIST2_SERIALIZE_H
|
||||
|
||||
#include "src/sist.h"
|
||||
#include "store.h"
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <glib.h>
|
||||
|
||||
typedef struct line_processor {
|
||||
void* data;
|
||||
void (*func)(const char*, void*);
|
||||
} line_processor_t;
|
||||
|
||||
typedef void(*index_func)(cJSON *, const char[SIST_DOC_ID_LEN]);
|
||||
|
||||
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
|
||||
const char *dst_filepath, GHashTable *copy_table);
|
||||
|
||||
void incremental_delete(const char *del_filepath, const char* index_filepath,
|
||||
GHashTable *copy_table, GHashTable *new_table);
|
||||
|
||||
void write_document(document_t *doc);
|
||||
|
||||
void read_lines(const char *path, const line_processor_t processor);
|
||||
|
||||
void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func);
|
||||
|
||||
void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc);
|
||||
|
||||
/**
|
||||
* Must be called after write_document
|
||||
*/
|
||||
void thread_cleanup();
|
||||
|
||||
void writer_cleanup();
|
||||
|
||||
void write_index_descriptor(char *path, index_descriptor_t *desc);
|
||||
|
||||
index_descriptor_t read_index_descriptor(char *path);
|
||||
|
||||
// caller ensures char file_path[PATH_MAX]
|
||||
#define READ_INDICES(file_path, index_path, action_ok, action_main_fail, cond_original) \
|
||||
snprintf(file_path, PATH_MAX, "%s_index_main.ndjson.zst", index_path); \
|
||||
if (access(file_path, R_OK) == 0) { \
|
||||
action_ok; \
|
||||
} else { \
|
||||
action_main_fail; \
|
||||
} \
|
||||
snprintf(file_path, PATH_MAX, "%s_index_original.ndjson.zst", index_path); \
|
||||
if ((cond_original) && access(file_path, R_OK) == 0) { \
|
||||
action_ok; \
|
||||
} \
|
||||
|
||||
|
||||
#endif
|
||||
|
232
src/io/store.c
232
src/io/store.c
@ -1,232 +0,0 @@
|
||||
#include <sys/mman.h>
|
||||
#include "store.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
//#define SIST_FAKE_STORE 1
|
||||
|
||||
void open_env(const char *path, MDB_env **env, MDB_dbi *dbi) {
|
||||
mdb_env_create(env);
|
||||
|
||||
int open_ret = mdb_env_open(*env,
|
||||
path,
|
||||
MDB_WRITEMAP | MDB_MAPASYNC,
|
||||
S_IRUSR | S_IWUSR
|
||||
);
|
||||
|
||||
if (open_ret != 0) {
|
||||
LOG_FATALF("store.c", "Error while opening store: %s (%s)\n", mdb_strerror(open_ret), path)
|
||||
}
|
||||
|
||||
MDB_txn *txn;
|
||||
mdb_txn_begin(*env, NULL, 0, &txn);
|
||||
mdb_dbi_open(txn, NULL, 0, dbi);
|
||||
mdb_txn_commit(txn);
|
||||
}
|
||||
|
||||
store_t *store_create(const char *path, size_t chunk_size) {
|
||||
store_t *store = calloc(1, sizeof(struct store_t));
|
||||
mkdir(path, S_IWUSR | S_IRUSR | S_IXUSR);
|
||||
strcpy(store->path, path);
|
||||
|
||||
MDB_env *env;
|
||||
MDB_dbi dbi;
|
||||
|
||||
#if (SIST_FAKE_STORE != 1)
|
||||
store->chunk_size = chunk_size;
|
||||
|
||||
store->shm = mmap(NULL, sizeof(*store->shm), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
|
||||
|
||||
open_env(path, &env, &dbi);
|
||||
|
||||
store->shm->size = (size_t) store->chunk_size;
|
||||
mdb_env_set_mapsize(env, store->shm->size);
|
||||
|
||||
// Close, child processes will open the environment again
|
||||
mdb_env_close(env);
|
||||
#endif
|
||||
|
||||
return store;
|
||||
}
|
||||
|
||||
void store_destroy(store_t *store) {
|
||||
|
||||
LOG_DEBUG("store.c", "store_destroy()")
|
||||
#if (SIST_FAKE_STORE != 1)
|
||||
munmap(store->shm, sizeof(*store->shm));
|
||||
|
||||
mdb_dbi_close(store->proc.env, store->proc.dbi);
|
||||
mdb_env_close(store->proc.env);
|
||||
#endif
|
||||
free(store);
|
||||
}
|
||||
|
||||
void store_flush(store_t *store) {
|
||||
mdb_env_sync(store->proc.env, TRUE);
|
||||
}
|
||||
|
||||
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len) {
|
||||
|
||||
ScanCtx.stat_tn_size += buf_len;
|
||||
|
||||
if (LogCtx.very_verbose) {
|
||||
LOG_DEBUGF("store.c", "Store write %s@{%s} %lu bytes", store->path, key, buf_len)
|
||||
}
|
||||
|
||||
#if (SIST_FAKE_STORE != 1)
|
||||
|
||||
if (store->proc.env == NULL) {
|
||||
open_env(store->path, &store->proc.env, &store->proc.dbi);
|
||||
LOG_DEBUGF("store.c", "Opening mdb environment %s", store->path)
|
||||
}
|
||||
|
||||
MDB_val mdb_key;
|
||||
mdb_key.mv_data = key;
|
||||
mdb_key.mv_size = key_len;
|
||||
|
||||
MDB_val mdb_value;
|
||||
mdb_value.mv_data = buf;
|
||||
mdb_value.mv_size = buf_len;
|
||||
|
||||
MDB_txn *txn;
|
||||
|
||||
int db_full = FALSE;
|
||||
int put_ret = 0;
|
||||
int should_abort_transaction = FALSE;
|
||||
int should_increase_size = TRUE;
|
||||
|
||||
int begin_ret = mdb_txn_begin(store->proc.env, NULL, 0, &txn);
|
||||
|
||||
if (begin_ret == MDB_MAP_RESIZED) {
|
||||
// mapsize was increased by another process. We don't need to increase the size again, but we need
|
||||
// to update the size of the environment for the current process.
|
||||
db_full = TRUE;
|
||||
should_increase_size = FALSE;
|
||||
} else {
|
||||
put_ret = mdb_put(txn, store->proc.dbi, &mdb_key, &mdb_value, 0);
|
||||
|
||||
if (put_ret == MDB_MAP_FULL) {
|
||||
// Database is full, we need to increase the environment size
|
||||
db_full = TRUE;
|
||||
should_abort_transaction = TRUE;
|
||||
} else {
|
||||
int commit_ret = mdb_txn_commit(txn);
|
||||
|
||||
if (commit_ret == MDB_MAP_FULL) {
|
||||
db_full = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (db_full) {
|
||||
LOG_DEBUGF("store.c", "Updating mdb mapsize to %lu bytes", store->shm->size)
|
||||
|
||||
if (should_abort_transaction) {
|
||||
mdb_txn_abort(txn);
|
||||
}
|
||||
|
||||
// Cannot resize when there is an opened transaction in this process.
|
||||
// Resize take effect on the next commit.
|
||||
if (should_increase_size) {
|
||||
store->shm->size += store->chunk_size;
|
||||
}
|
||||
int resize_ret = mdb_env_set_mapsize(store->proc.env, store->shm->size);
|
||||
if (resize_ret != 0) {
|
||||
LOG_ERRORF("store.c", "mdb_env_set_mapsize() failed: %s", mdb_strerror(resize_ret))
|
||||
}
|
||||
mdb_txn_begin(store->proc.env, NULL, 0, &txn);
|
||||
int put_ret_retry = mdb_put(txn, store->proc.dbi, &mdb_key, &mdb_value, 0);
|
||||
|
||||
if (put_ret_retry != 0) {
|
||||
LOG_ERRORF("store.c", "mdb_put() (retry) failed: %s", mdb_strerror(put_ret_retry))
|
||||
}
|
||||
|
||||
int ret = mdb_txn_commit(txn);
|
||||
if (ret != 0) {
|
||||
LOG_FATALF("store.c", "FIXME: Could not commit to store %s: %s (%d), %d, %d %d",
|
||||
store->path, mdb_strerror(ret), ret,
|
||||
ret, put_ret_retry)
|
||||
}
|
||||
LOG_DEBUGF("store.c", "Updated mdb mapsize to %lu bytes", store->shm->size)
|
||||
} else if (put_ret != 0) {
|
||||
LOG_ERRORF("store.c", "mdb_put() failed: %s", mdb_strerror(put_ret))
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
char *store_read(store_t *store, char *key, size_t key_len, size_t *return_value_len) {
|
||||
char *buf = NULL;
|
||||
|
||||
#if (SIST_FAKE_STORE != 1)
|
||||
if (store->proc.env == NULL) {
|
||||
open_env(store->path, &store->proc.env, &store->proc.dbi);
|
||||
}
|
||||
|
||||
MDB_val mdb_key;
|
||||
mdb_key.mv_data = key;
|
||||
mdb_key.mv_size = key_len;
|
||||
|
||||
MDB_val mdb_value;
|
||||
|
||||
MDB_txn *txn;
|
||||
mdb_txn_begin(store->proc.env, NULL, MDB_RDONLY, &txn);
|
||||
|
||||
int get_ret = mdb_get(txn, store->proc.dbi, &mdb_key, &mdb_value);
|
||||
|
||||
if (get_ret == MDB_NOTFOUND) {
|
||||
*return_value_len = 0;
|
||||
} else {
|
||||
*return_value_len = mdb_value.mv_size;
|
||||
buf = malloc(mdb_value.mv_size);
|
||||
memcpy(buf, mdb_value.mv_data, mdb_value.mv_size);
|
||||
}
|
||||
|
||||
mdb_txn_abort(txn);
|
||||
#endif
|
||||
return buf;
|
||||
}
|
||||
|
||||
GHashTable *store_read_all(store_t *store) {
|
||||
|
||||
if (store->proc.env == NULL) {
|
||||
open_env(store->path, &store->proc.env, &store->proc.dbi);
|
||||
LOG_DEBUGF("store.c", "Opening mdb environment %s", store->path)
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
|
||||
GHashTable *table = g_hash_table_new_full(g_str_hash, g_str_equal, free, free);
|
||||
|
||||
MDB_txn *txn = NULL;
|
||||
mdb_txn_begin(store->proc.env, NULL, MDB_RDONLY, &txn);
|
||||
|
||||
MDB_cursor *cur = NULL;
|
||||
mdb_cursor_open(txn, store->proc.dbi, &cur);
|
||||
|
||||
MDB_val key;
|
||||
MDB_val value;
|
||||
|
||||
while (mdb_cursor_get(cur, &key, &value, MDB_NEXT) == 0) {
|
||||
char *key_str = malloc(key.mv_size);
|
||||
memcpy(key_str, key.mv_data, key.mv_size);
|
||||
char *val_str = malloc(value.mv_size);
|
||||
memcpy(val_str, value.mv_data, value.mv_size);
|
||||
|
||||
g_hash_table_insert(table, key_str, val_str);
|
||||
count += 1;
|
||||
}
|
||||
|
||||
const char *path;
|
||||
mdb_env_get_path(store->proc.env, &path);
|
||||
LOG_DEBUGF("store.c", "Read %d entries from %s", count, path)
|
||||
|
||||
mdb_cursor_close(cur);
|
||||
mdb_txn_abort(txn);
|
||||
return table;
|
||||
}
|
||||
|
||||
|
||||
void store_copy(store_t *store, const char *destination) {
|
||||
mkdir(destination, S_IWUSR | S_IRUSR | S_IXUSR);
|
||||
mdb_env_copy(store->proc.env, destination);
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
#ifndef SIST2_STORE_H
|
||||
#define SIST2_STORE_H
|
||||
|
||||
#include <pthread.h>
|
||||
#include <lmdb.h>
|
||||
|
||||
#include <glib.h>
|
||||
|
||||
#define STORE_SIZE_TN (1024 * 1024 * 5)
|
||||
#define STORE_SIZE_TAG (1024 * 1024)
|
||||
#define STORE_SIZE_META STORE_SIZE_TAG
|
||||
|
||||
|
||||
typedef struct store_t {
|
||||
char path[PATH_MAX];
|
||||
size_t chunk_size;
|
||||
|
||||
struct {
|
||||
MDB_dbi dbi;
|
||||
MDB_env *env;
|
||||
} proc;
|
||||
|
||||
struct {
|
||||
size_t size;
|
||||
} *shm;
|
||||
} store_t;
|
||||
|
||||
store_t *store_create(const char *path, size_t chunk_size);
|
||||
|
||||
void store_destroy(store_t *store);
|
||||
|
||||
void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t buf_len);
|
||||
|
||||
void store_flush(store_t *store);
|
||||
|
||||
char *store_read(store_t *store, char *key, size_t key_len, size_t *return_value_len);
|
||||
|
||||
GHashTable *store_read_all(store_t *store);
|
||||
|
||||
void store_copy(store_t *store, const char *destination);
|
||||
|
||||
#endif
|
@ -1,46 +1,12 @@
|
||||
#include "walk.h"
|
||||
#include "src/ctx.h"
|
||||
#include "src/parsing/parse.h"
|
||||
#include "src/parsing/fs_util.h"
|
||||
|
||||
#include <ftw.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0)
|
||||
|
||||
__always_inline
|
||||
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
|
||||
int len = (int) strlen(filepath);
|
||||
parse_job_t *job = malloc(sizeof(parse_job_t));
|
||||
|
||||
strcpy(job->filepath, filepath);
|
||||
job->base = base;
|
||||
char *p = strrchr(filepath + base, '.');
|
||||
if (p != NULL) {
|
||||
job->ext = (int) (p - filepath + 1);
|
||||
} else {
|
||||
job->ext = len;
|
||||
}
|
||||
|
||||
job->vfile.st_size = info->st_size;
|
||||
job->vfile.st_mode = info->st_mode;
|
||||
job->vfile.mtime = (int) info->st_mtim.tv_sec;
|
||||
|
||||
job->parent[0] = '\0';
|
||||
|
||||
memcpy(job->vfile.filepath, job->filepath, sizeof(job->vfile.filepath));
|
||||
job->vfile.read = fs_read;
|
||||
// Filesystem reads are always rewindable
|
||||
job->vfile.read_rewindable = fs_read;
|
||||
job->vfile.reset = fs_reset;
|
||||
job->vfile.close = fs_close;
|
||||
job->vfile.fd = -1;
|
||||
job->vfile.is_fs_file = TRUE;
|
||||
job->vfile.has_checksum = FALSE;
|
||||
job->vfile.rewind_buffer_size = 0;
|
||||
job->vfile.rewind_buffer = NULL;
|
||||
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
int sub_strings[30];
|
||||
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
|
||||
@ -55,7 +21,7 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
|
||||
}
|
||||
|
||||
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
|
||||
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
|
||||
LOG_DEBUGF("walk.c", "Excluded: %s", filepath);
|
||||
|
||||
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
@ -69,13 +35,13 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
|
||||
}
|
||||
|
||||
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
|
||||
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
|
||||
parse_job_t *job = create_parse_job(filepath, (int) info->st_mtim.tv_sec, info->st_size);
|
||||
|
||||
tpool_work_arg_t arg = {
|
||||
.arg_size = sizeof(parse_job_t),
|
||||
.arg = job
|
||||
};
|
||||
tpool_add_work(ScanCtx.pool, parse, &arg);
|
||||
tpool_add_work(ScanCtx.pool, &(job_t) {
|
||||
.type = JOB_PARSE_JOB,
|
||||
.parse_job = job
|
||||
});
|
||||
free(job);
|
||||
}
|
||||
|
||||
return FTW_CONTINUE;
|
||||
@ -116,7 +82,7 @@ int iterate_file_list(void *input_file) {
|
||||
}
|
||||
|
||||
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
|
||||
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
|
||||
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path);
|
||||
|
||||
if (S_ISREG(info.st_mode)) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
@ -131,16 +97,14 @@ int iterate_file_list(void *input_file) {
|
||||
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
|
||||
}
|
||||
|
||||
int base = (int) (strrchr(buf, '/') - buf) + 1;
|
||||
|
||||
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
|
||||
parse_job_t *job = create_parse_job(absolute_path, (int) info.st_mtim.tv_sec, info.st_size);
|
||||
free(absolute_path);
|
||||
|
||||
tpool_work_arg_t arg = {
|
||||
.arg = job,
|
||||
.arg_size = sizeof(parse_job_t)
|
||||
};
|
||||
tpool_add_work(ScanCtx.pool, parse, &arg);
|
||||
tpool_add_work(ScanCtx.pool, &(job_t) {
|
||||
.type = JOB_PARSE_JOB,
|
||||
.parse_job = job
|
||||
});
|
||||
free(job);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
28
src/log.c
28
src/log.c
@ -21,8 +21,6 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
|
||||
|
||||
char log_str[LOG_MAX_LENGTH];
|
||||
|
||||
unsigned long long pid = (unsigned long long) pthread_self();
|
||||
|
||||
char datetime[32];
|
||||
time_t t;
|
||||
struct tm result;
|
||||
@ -42,8 +40,8 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
|
||||
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
|
||||
pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
|
||||
"{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
|
||||
ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
|
||||
);
|
||||
|
||||
cJSON_Delete(filepath_json);
|
||||
@ -58,15 +56,15 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
|
||||
if (is_tty) {
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"\033[%dm[%04llX]%s [%s] [%s %s] ",
|
||||
31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
|
||||
"\033[%dmT%d%s [%s] [%s %s] ",
|
||||
31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
|
||||
datetime, log_levels[level], filepath
|
||||
);
|
||||
} else {
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"[%04llX] [%s] [%s %s] ",
|
||||
pid, datetime, log_levels[level], filepath
|
||||
"T%d [%s] [%s %s] ",
|
||||
ProcData.thread_id, datetime, log_levels[level], filepath
|
||||
);
|
||||
}
|
||||
|
||||
@ -112,8 +110,6 @@ void sist_log(const char *filepath, int level, char *str) {
|
||||
|
||||
char log_str[LOG_MAX_LENGTH];
|
||||
|
||||
unsigned long long pid = (unsigned long long) pthread_self();
|
||||
|
||||
char datetime[32];
|
||||
time_t t;
|
||||
struct tm result;
|
||||
@ -132,8 +128,8 @@ void sist_log(const char *filepath, int level, char *str) {
|
||||
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"{\"thread\":\"%04llX\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
|
||||
pid, datetime, log_levels[level], filepath_json_str, log_str_json_str
|
||||
"{\"thread\":\"T%d\",\"datetime\":\"%s\",\"level\":\"%s\",\"filepath\":%s,\"message\":%s}\n",
|
||||
ProcData.thread_id, datetime, log_levels[level], filepath_json_str, log_str_json_str
|
||||
);
|
||||
|
||||
cJSON_Delete(log_str_json);
|
||||
@ -147,16 +143,16 @@ void sist_log(const char *filepath, int level, char *str) {
|
||||
if (is_tty) {
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"\033[%dm[%04llX]%s [%s] [%s %s] %s \033[0m\n",
|
||||
31 + ((unsigned int) (pid)) % 7, pid, log_colors[level],
|
||||
"\033[%dmT%d%s [%s] [%s %s] %s \033[0m\n",
|
||||
31 + ProcData.thread_id % 7, ProcData.thread_id, log_colors[level],
|
||||
datetime, log_levels[level], filepath,
|
||||
str
|
||||
);
|
||||
} else {
|
||||
log_len = snprintf(
|
||||
log_str, sizeof(log_str),
|
||||
"[%04llX] [%s] [%s %s] %s \n",
|
||||
pid, datetime, log_levels[level], filepath,
|
||||
"T%d [%s] [%s %s] %s \n",
|
||||
ProcData.thread_id, datetime, log_levels[level], filepath,
|
||||
str
|
||||
);
|
||||
}
|
||||
|
43
src/log.h
43
src/log.h
@ -2,6 +2,7 @@
|
||||
#define SIST2_LOG_H
|
||||
|
||||
|
||||
#include <signal.h>
|
||||
#define LOG_MAX_LENGTH 8192
|
||||
|
||||
#define LOG_SIST_DEBUG 0
|
||||
@ -10,37 +11,37 @@
|
||||
#define LOG_SIST_ERROR 3
|
||||
#define LOG_SIST_FATAL 4
|
||||
|
||||
#define LOG_DEBUGF(filepath, fmt, ...) \
|
||||
if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}
|
||||
#define LOG_DEBUG(filepath, str) \
|
||||
if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}
|
||||
#define LOG_DEBUGF(filepath, fmt, ...) do{\
|
||||
if (LogCtx.very_verbose) {sist_logf(filepath, LOG_SIST_DEBUG, fmt, __VA_ARGS__);}}while(0)
|
||||
#define LOG_DEBUG(filepath, str) do{\
|
||||
if (LogCtx.very_verbose) {sist_log(filepath, LOG_SIST_DEBUG, str);}}while(0)
|
||||
|
||||
#define LOG_INFOF(filepath, fmt, ...) \
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}
|
||||
#define LOG_INFO(filepath, str) \
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}
|
||||
#define LOG_INFOF(filepath, fmt, ...) do {\
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_INFO, fmt, __VA_ARGS__);}} while(0)
|
||||
#define LOG_INFO(filepath, str) do {\
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_INFO, str);}} while(0)
|
||||
|
||||
#define LOG_WARNINGF(filepath, fmt, ...) \
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}
|
||||
#define LOG_WARNING(filepath, str) \
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}
|
||||
#define LOG_WARNINGF(filepath, fmt, ...) do {\
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_WARNING, fmt, __VA_ARGS__);}}while(0)
|
||||
#define LOG_WARNING(filepath, str) do{\
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_WARNING, str);}}while(0)
|
||||
|
||||
#define LOG_ERRORF(filepath, fmt, ...) \
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}
|
||||
#define LOG_ERROR(filepath, str) \
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}
|
||||
#define LOG_ERRORF(filepath, fmt, ...) do {\
|
||||
if (LogCtx.verbose) {sist_logf(filepath, LOG_SIST_ERROR, fmt, __VA_ARGS__);}}while(0)
|
||||
#define LOG_ERROR(filepath, str) do{\
|
||||
if (LogCtx.verbose) {sist_log(filepath, LOG_SIST_ERROR, str);}}while(0)
|
||||
|
||||
#define LOG_FATALF(filepath, fmt, ...) \
|
||||
#define LOG_FATALF(filepath, fmt, ...)\
|
||||
sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__);\
|
||||
exit(-1);
|
||||
raise(SIGUSR1)
|
||||
#define LOG_FATAL(filepath, str) \
|
||||
sist_log(filepath, LOG_SIST_FATAL, str);\
|
||||
exit(-1);
|
||||
exit(SIGUSR1)
|
||||
|
||||
#define LOG_FATALF_NO_EXIT(filepath, fmt, ...) \
|
||||
sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__);
|
||||
sist_logf(filepath, LOG_SIST_FATAL, fmt, __VA_ARGS__)
|
||||
#define LOG_FATAL_NO_EXIT(filepath, str) \
|
||||
sist_log(filepath, LOG_SIST_FATAL, str);
|
||||
sist_log(filepath, LOG_SIST_FATAL, str)
|
||||
|
||||
#include "sist.h"
|
||||
|
||||
|
437
src/main.c
437
src/main.c
@ -5,8 +5,6 @@
|
||||
#include <locale.h>
|
||||
|
||||
#include "cli.h"
|
||||
#include "io/serialize.h"
|
||||
#include "io/store.h"
|
||||
#include "tpool.h"
|
||||
#include "io/walk.h"
|
||||
#include "index/elastic.h"
|
||||
@ -16,10 +14,9 @@
|
||||
#include "auth0/auth0_c_api.h"
|
||||
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include "stats.h"
|
||||
#include "src/database/database.h"
|
||||
|
||||
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
|
||||
|
||||
@ -46,30 +43,31 @@ void sig_handler(int signum) {
|
||||
LOG_ERROR("*SIGNAL HANDLER*", "=============================================\n\n");
|
||||
LOG_ERRORF("*SIGNAL HANDLER*", "Uh oh! Caught fatal signal: %s", strsignal(signum));
|
||||
|
||||
if (ScanCtx.dbg_current_files != NULL) {
|
||||
GHashTableIter iter;
|
||||
g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files);
|
||||
|
||||
void *key;
|
||||
void *value;
|
||||
while (g_hash_table_iter_next(&iter, &key, &value)) {
|
||||
parse_job_t *job = value;
|
||||
|
||||
if (isatty(STDERR_FILENO)) {
|
||||
LOG_DEBUGF(
|
||||
"*SIGNAL HANDLER*",
|
||||
"Thread \033[%dm[%04llX]\033[0m was working on job '%s'",
|
||||
31 + ((unsigned int) key) % 7, key, job->filepath
|
||||
);
|
||||
} else {
|
||||
LOG_DEBUGF(
|
||||
"*SIGNAL HANDLER*",
|
||||
"THREAD [%04llX] was working on job %s",
|
||||
key, job->filepath
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: Print debug info
|
||||
// if (ScanCtx.dbg_current_files != NULL) {
|
||||
// GHashTableIter iter;
|
||||
// g_hash_table_iter_init(&iter, ScanCtx.dbg_current_files);
|
||||
//
|
||||
// void *key;
|
||||
// void *value;
|
||||
// while (g_hash_table_iter_next(&iter, &key, &value)) {
|
||||
// parse_job_t *job = value;
|
||||
//
|
||||
// if (isatty(STDERR_FILENO)) {
|
||||
// LOG_DEBUGF(
|
||||
// "*SIGNAL HANDLER*",
|
||||
// "Thread \033[%dm[%04llX]\033[0m was working on job '%s'",
|
||||
// 31 + ((unsigned int) key) % 7, key, job->filepath
|
||||
// );
|
||||
// } else {
|
||||
// LOG_DEBUGF(
|
||||
// "*SIGNAL HANDLER*",
|
||||
// "THREAD [%04llX] was working on job %s",
|
||||
// key, job->filepath
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
if (ScanCtx.pool != NULL) {
|
||||
tpool_dump_debug_info(ScanCtx.pool);
|
||||
@ -82,18 +80,18 @@ void sig_handler(int signum) {
|
||||
LOG_INFO(
|
||||
"*SIGNAL HANDLER*",
|
||||
"Please consider creating a bug report at https://github.com/simon987/sist2/issues !"
|
||||
)
|
||||
);
|
||||
LOG_INFO(
|
||||
"*SIGNAL HANDLER*",
|
||||
"sist2 is an open source project and relies on the collaboration of its users to diagnose and fix bugs"
|
||||
)
|
||||
);
|
||||
|
||||
#ifndef SIST_DEBUG
|
||||
LOG_WARNING(
|
||||
"*SIGNAL HANDLER*",
|
||||
"You are running sist2 in release mode! Please consider downloading the debug binary from the Github "
|
||||
"releases page to provide additionnal information when submitting a bug report."
|
||||
)
|
||||
);
|
||||
#endif
|
||||
|
||||
if (signum == SIGSEGV && sigsegv_handler != NULL) {
|
||||
@ -105,36 +103,59 @@ void sig_handler(int signum) {
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
void init_dir(const char *dirpath, scan_args_t *args) {
|
||||
char path[PATH_MAX];
|
||||
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
|
||||
void database_scan_begin(scan_args_t *args) {
|
||||
index_descriptor_t *desc = &ScanCtx.index.desc;
|
||||
|
||||
time(&ScanCtx.index.desc.timestamp);
|
||||
strcpy(ScanCtx.index.desc.version, Version);
|
||||
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_NDJSON);
|
||||
database_t *db = database_create(args->output, INDEX_DATABASE);
|
||||
|
||||
if (args->incremental) {
|
||||
// Update existing descriptor
|
||||
database_open(db);
|
||||
index_descriptor_t *original_desc = database_read_index_descriptor(db);
|
||||
|
||||
// copy original index id
|
||||
strcpy(desc->id, original_desc->id);
|
||||
|
||||
if (original_desc->version_major != VersionMajor) {
|
||||
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc->version, Version);
|
||||
}
|
||||
|
||||
strcpy(original_desc->root, desc->root);
|
||||
original_desc->root_len = desc->root_len;
|
||||
strcpy(original_desc->rewrite_url, desc->rewrite_url);
|
||||
strcpy(original_desc->name, desc->name);
|
||||
|
||||
time(&original_desc->timestamp);
|
||||
|
||||
database_write_index_descriptor(db, original_desc);
|
||||
free(original_desc);
|
||||
|
||||
database_incremental_scan_begin(db);
|
||||
|
||||
if (args->incremental != NULL) {
|
||||
// copy old index id
|
||||
char descriptor_path[PATH_MAX];
|
||||
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
|
||||
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
|
||||
memcpy(ScanCtx.index.desc.id, original_desc.id, sizeof(original_desc.id));
|
||||
} else {
|
||||
// Create new descriptor
|
||||
|
||||
time(&desc->timestamp);
|
||||
strcpy(desc->version, Version);
|
||||
desc->version_major = VersionMajor;
|
||||
desc->version_minor = VersionMinor;
|
||||
desc->version_patch = VersionPatch;
|
||||
|
||||
// generate new index id based on timestamp
|
||||
unsigned char index_md5[MD5_DIGEST_LENGTH];
|
||||
MD5((unsigned char *) &ScanCtx.index.desc.timestamp, sizeof(ScanCtx.index.desc.timestamp), index_md5);
|
||||
buf2hex(index_md5, MD5_DIGEST_LENGTH, ScanCtx.index.desc.id);
|
||||
|
||||
database_initialize(db);
|
||||
database_open(db);
|
||||
database_write_index_descriptor(db, desc);
|
||||
}
|
||||
|
||||
write_index_descriptor(path, &ScanCtx.index.desc);
|
||||
database_close(db, FALSE);
|
||||
}
|
||||
|
||||
void scan_print_header() {
|
||||
LOG_INFOF("main.c", "sist2 v%s", Version)
|
||||
}
|
||||
|
||||
void _store(char *key, size_t key_len, char *buf, size_t buf_len) {
|
||||
store_write(ScanCtx.index.store, key, key_len, buf, buf_len);
|
||||
void write_thumbnail_callback(char *key, int num, void *buf, size_t buf_len) {
|
||||
database_write_thumbnail(ProcData.index_db, key, num, buf, buf_len);
|
||||
}
|
||||
|
||||
void _log(const char *filepath, int level, char *str) {
|
||||
@ -177,11 +198,8 @@ void _logf(const char *filepath, int level, char *format, ...) {
|
||||
}
|
||||
|
||||
void initialize_scan_context(scan_args_t *args) {
|
||||
|
||||
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
|
||||
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
|
||||
// TODO: shared
|
||||
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
|
||||
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
|
||||
|
||||
ScanCtx.calculate_checksums = args->calculate_checksums;
|
||||
|
||||
@ -189,7 +207,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.arc_ctx.mode = args->archive_mode;
|
||||
ScanCtx.arc_ctx.log = _log;
|
||||
ScanCtx.arc_ctx.logf = _logf;
|
||||
ScanCtx.arc_ctx.parse = (parse_callback_t) parse_job;
|
||||
ScanCtx.arc_ctx.parse = (parse_callback_t) parse;
|
||||
if (args->archive_passphrase != NULL) {
|
||||
strcpy(ScanCtx.arc_ctx.passphrase, args->archive_passphrase);
|
||||
} else {
|
||||
@ -199,12 +217,12 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
// Comic
|
||||
ScanCtx.comic_ctx.log = _log;
|
||||
ScanCtx.comic_ctx.logf = _logf;
|
||||
ScanCtx.comic_ctx.store = _store;
|
||||
ScanCtx.comic_ctx.store = write_thumbnail_callback;
|
||||
ScanCtx.comic_ctx.enable_tn = args->tn_count > 0;
|
||||
ScanCtx.comic_ctx.tn_size = args->tn_size;
|
||||
ScanCtx.comic_ctx.tn_qscale = args->tn_quality;
|
||||
ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
|
||||
ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbz");
|
||||
ScanCtx.comic_ctx.cbr_mime = mime_get_mime_by_string("application/x-cbr");
|
||||
ScanCtx.comic_ctx.cbz_mime = mime_get_mime_by_string("application/x-cbz");
|
||||
|
||||
// Ebook
|
||||
ScanCtx.ebook_ctx.content_size = args->content_size;
|
||||
@ -216,7 +234,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
}
|
||||
ScanCtx.ebook_ctx.log = _log;
|
||||
ScanCtx.ebook_ctx.logf = _logf;
|
||||
ScanCtx.ebook_ctx.store = _store;
|
||||
ScanCtx.ebook_ctx.store = write_thumbnail_callback;
|
||||
ScanCtx.ebook_ctx.fast_epub_parse = args->fast_epub;
|
||||
ScanCtx.ebook_ctx.tn_qscale = args->tn_quality;
|
||||
|
||||
@ -224,7 +242,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.font_ctx.enable_tn = args->tn_count > 0;
|
||||
ScanCtx.font_ctx.log = _log;
|
||||
ScanCtx.font_ctx.logf = _logf;
|
||||
ScanCtx.font_ctx.store = _store;
|
||||
ScanCtx.font_ctx.store = write_thumbnail_callback;
|
||||
|
||||
// Media
|
||||
ScanCtx.media_ctx.tn_qscale = args->tn_quality;
|
||||
@ -232,7 +250,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.media_ctx.tn_count = args->tn_count;
|
||||
ScanCtx.media_ctx.log = _log;
|
||||
ScanCtx.media_ctx.logf = _logf;
|
||||
ScanCtx.media_ctx.store = _store;
|
||||
ScanCtx.media_ctx.store = write_thumbnail_callback;
|
||||
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer_mib * 1024 * 1024;
|
||||
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
|
||||
ScanCtx.media_ctx.read_subtitles = args->tn_count;
|
||||
@ -248,7 +266,7 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.ooxml_ctx.content_size = args->content_size;
|
||||
ScanCtx.ooxml_ctx.log = _log;
|
||||
ScanCtx.ooxml_ctx.logf = _logf;
|
||||
ScanCtx.ooxml_ctx.store = _store;
|
||||
ScanCtx.ooxml_ctx.store = write_thumbnail_callback;
|
||||
|
||||
// MOBI
|
||||
ScanCtx.mobi_ctx.content_size = args->content_size;
|
||||
@ -264,8 +282,8 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.msdoc_ctx.content_size = args->content_size;
|
||||
ScanCtx.msdoc_ctx.log = _log;
|
||||
ScanCtx.msdoc_ctx.logf = _logf;
|
||||
ScanCtx.msdoc_ctx.store = _store;
|
||||
ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/msword");
|
||||
ScanCtx.msdoc_ctx.store = write_thumbnail_callback;
|
||||
ScanCtx.msdoc_ctx.msdoc_mime = mime_get_mime_by_string("application/msword");
|
||||
|
||||
ScanCtx.threads = args->threads;
|
||||
ScanCtx.depth = args->depth;
|
||||
@ -283,174 +301,67 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.raw_ctx.tn_size = args->tn_size;
|
||||
ScanCtx.raw_ctx.log = _log;
|
||||
ScanCtx.raw_ctx.logf = _logf;
|
||||
ScanCtx.raw_ctx.store = _store;
|
||||
ScanCtx.raw_ctx.store = write_thumbnail_callback;
|
||||
|
||||
// Wpd
|
||||
ScanCtx.wpd_ctx.content_size = args->content_size;
|
||||
ScanCtx.wpd_ctx.log = _log;
|
||||
ScanCtx.wpd_ctx.logf = _logf;
|
||||
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
|
||||
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string("application/wordperfect");
|
||||
|
||||
// Json
|
||||
ScanCtx.json_ctx.content_size = args->content_size;
|
||||
ScanCtx.json_ctx.log = _log;
|
||||
ScanCtx.json_ctx.logf = _logf;
|
||||
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/json");
|
||||
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/ndjson");
|
||||
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string("application/json");
|
||||
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string("application/ndjson");
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads an existing index as the baseline for incremental scanning.
|
||||
* 1. load old index files (original+main) => original_table
|
||||
* 2. allocate empty table => copy_table
|
||||
* 3. allocate empty table => new_table
|
||||
* the original_table/copy_table/new_table will be populated in parsing/parse.c:parse
|
||||
* and consumed in main.c:save_incremental_index
|
||||
*
|
||||
* Note: the existing index may or may not be of incremental index form.
|
||||
*/
|
||||
void load_incremental_index(const scan_args_t *args) {
|
||||
char file_path[PATH_MAX];
|
||||
|
||||
ScanCtx.original_table = incremental_get_table();
|
||||
ScanCtx.copy_table = incremental_get_table();
|
||||
ScanCtx.new_table = incremental_get_table();
|
||||
|
||||
char descriptor_path[PATH_MAX];
|
||||
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->incremental);
|
||||
index_descriptor_t original_desc = read_index_descriptor(descriptor_path);
|
||||
|
||||
if (strcmp(original_desc.version, Version) != 0) {
|
||||
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", original_desc.version, Version)
|
||||
}
|
||||
|
||||
READ_INDICES(
|
||||
file_path,
|
||||
args->incremental,
|
||||
incremental_read(ScanCtx.original_table, file_path, &original_desc),
|
||||
LOG_DEBUG("main.c", "The base index for incremental scan does not have a main index"),
|
||||
TRUE
|
||||
);
|
||||
|
||||
LOG_INFOF("main.c", "Loaded %d items in to mtime table.", g_hash_table_size(ScanCtx.original_table))
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves an incremental index.
|
||||
* Before calling this function, the scanner should have finished writing the main index.
|
||||
* 1. Build original_table - new_table => delete_table
|
||||
* 2. Incrementally copy from old index files [(original+main) /\ copy_table] => index_original.ndjson.zst & store
|
||||
*/
|
||||
void save_incremental_index(scan_args_t *args) {
|
||||
char dst_path[PATH_MAX];
|
||||
char store_path[PATH_MAX];
|
||||
char file_path[PATH_MAX];
|
||||
char del_path[PATH_MAX];
|
||||
snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental);
|
||||
snprintf(dst_path, PATH_MAX, "%s_index_original.ndjson.zst", ScanCtx.index.path);
|
||||
store_t *source = store_create(store_path, STORE_SIZE_TN);
|
||||
|
||||
LOG_INFOF("main.c", "incremental_delete: original size = %u, copy size = %u, new size = %u",
|
||||
g_hash_table_size(ScanCtx.original_table),
|
||||
g_hash_table_size(ScanCtx.copy_table),
|
||||
g_hash_table_size(ScanCtx.new_table));
|
||||
snprintf(del_path, PATH_MAX, "%s_index_delete.list.zst", ScanCtx.index.path);
|
||||
READ_INDICES(file_path, args->incremental,
|
||||
incremental_delete(del_path, file_path, ScanCtx.copy_table, ScanCtx.new_table),
|
||||
perror("incremental_delete"), 1);
|
||||
writer_cleanup();
|
||||
|
||||
READ_INDICES(file_path, args->incremental,
|
||||
incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table),
|
||||
perror("incremental_copy"), 1);
|
||||
writer_cleanup();
|
||||
|
||||
store_destroy(source);
|
||||
|
||||
snprintf(store_path, PATH_MAX, "%stags", args->incremental);
|
||||
snprintf(dst_path, PATH_MAX, "%stags", ScanCtx.index.path);
|
||||
store_t *source_tags = store_create(store_path, STORE_SIZE_TAG);
|
||||
store_copy(source_tags, dst_path);
|
||||
store_destroy(source_tags);
|
||||
}
|
||||
|
||||
/**
|
||||
* An index can be either incremental or non-incremental (initial index).
|
||||
* For an initial index, there is only the "main" index.
|
||||
* For an incremental index, there are, additionally:
|
||||
* - An "original" index, referencing all files unchanged since the previous index.
|
||||
* - A "delete" index, referencing all files that exist in the previous index, but deleted since then.
|
||||
* Therefore, for an incremental index, "main"+"original" covers all the current files in the live filesystem,
|
||||
* and is orthognal with the "delete" index. When building an incremental index upon an old incremental index,
|
||||
* the old "delete" index can be safely ignored.
|
||||
*/
|
||||
void sist2_scan(scan_args_t *args) {
|
||||
|
||||
ScanCtx.mime_table = mime_get_mime_table();
|
||||
ScanCtx.ext_table = mime_get_ext_table();
|
||||
|
||||
initialize_scan_context(args);
|
||||
|
||||
init_dir(ScanCtx.index.path, args);
|
||||
database_scan_begin(args);
|
||||
|
||||
char store_path[PATH_MAX];
|
||||
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
|
||||
ScanCtx.index.store = store_create(store_path, STORE_SIZE_TN);
|
||||
LOG_INFOF("main.c", "sist2 v%s", Version);
|
||||
|
||||
snprintf(store_path, PATH_MAX, "%smeta", ScanCtx.index.path);
|
||||
ScanCtx.index.meta_store = store_create(store_path, STORE_SIZE_META);
|
||||
|
||||
scan_print_header();
|
||||
|
||||
if (args->incremental != NULL) {
|
||||
load_incremental_index(args);
|
||||
}
|
||||
|
||||
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, FALSE);
|
||||
tpool_start(ScanCtx.writer_pool);
|
||||
|
||||
ScanCtx.pool = tpool_create(ScanCtx.threads, thread_cleanup, TRUE);
|
||||
ScanCtx.pool = tpool_create(ScanCtx.threads, TRUE);
|
||||
tpool_start(ScanCtx.pool);
|
||||
|
||||
if (args->list_path) {
|
||||
// Scan using file list
|
||||
int list_ret = iterate_file_list(args->list_file);
|
||||
if (list_ret != 0) {
|
||||
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
|
||||
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret);
|
||||
}
|
||||
} else {
|
||||
// Scan directory recursively
|
||||
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
|
||||
if (walk_ret == -1) {
|
||||
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
|
||||
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno);
|
||||
}
|
||||
}
|
||||
|
||||
tpool_wait(ScanCtx.pool);
|
||||
tpool_destroy(ScanCtx.pool);
|
||||
|
||||
tpool_wait(ScanCtx.writer_pool);
|
||||
tpool_destroy(ScanCtx.writer_pool);
|
||||
LOG_DEBUGF("main.c", "Skipped files: %d", ScanCtx.dbg_skipped_files_count);
|
||||
LOG_DEBUGF("main.c", "Excluded files: %d", ScanCtx.dbg_excluded_files_count);
|
||||
LOG_DEBUGF("main.c", "Failed files: %d", ScanCtx.dbg_failed_files_count);
|
||||
LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size);
|
||||
LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size);
|
||||
|
||||
LOG_DEBUGF("main.c", "Skipped files: %d", ScanCtx.dbg_skipped_files_count)
|
||||
LOG_DEBUGF("main.c", "Excluded files: %d", ScanCtx.dbg_excluded_files_count)
|
||||
LOG_DEBUGF("main.c", "Failed files: %d", ScanCtx.dbg_failed_files_count)
|
||||
LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size)
|
||||
LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size)
|
||||
database_t *db = database_create(args->output, INDEX_DATABASE);
|
||||
database_open(db);
|
||||
|
||||
if (args->incremental != NULL) {
|
||||
save_incremental_index(args);
|
||||
if (args->incremental != FALSE) {
|
||||
database_incremental_scan_end(db);
|
||||
}
|
||||
|
||||
generate_stats(&ScanCtx.index, args->treemap_threshold, ScanCtx.index.path);
|
||||
|
||||
store_destroy(ScanCtx.index.store);
|
||||
store_destroy(ScanCtx.index.meta_store);
|
||||
database_generate_stats(db, args->treemap_threshold);
|
||||
database_close(db, TRUE);
|
||||
}
|
||||
|
||||
void sist2_index(index_args_t *args) {
|
||||
char file_path[PATH_MAX];
|
||||
|
||||
IndexCtx.es_url = args->es_url;
|
||||
IndexCtx.es_index = args->es_index;
|
||||
IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
|
||||
@ -461,91 +372,69 @@ void sist2_index(index_args_t *args) {
|
||||
elastic_init(args->force_reset, args->es_mappings, args->es_settings);
|
||||
}
|
||||
|
||||
char descriptor_path[PATH_MAX];
|
||||
snprintf(descriptor_path, PATH_MAX, "%sdescriptor.json", args->index_path);
|
||||
database_t *db = database_create(args->index_path, INDEX_DATABASE);
|
||||
database_open(db);
|
||||
index_descriptor_t *desc = database_read_index_descriptor(db);
|
||||
database_close(db, FALSE);
|
||||
|
||||
index_descriptor_t desc = read_index_descriptor(descriptor_path);
|
||||
LOG_DEBUGF("main.c", "Index version %s", desc->version);
|
||||
|
||||
LOG_DEBUGF("main.c", "descriptor version %s (%s)", desc.version, desc.type)
|
||||
|
||||
if (strcmp(desc.version, Version) != 0) {
|
||||
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", desc.version, Version)
|
||||
if (desc->version_major != VersionMajor) {
|
||||
LOG_FATALF("main.c", "Version mismatch! Index is %s but executable is %s", desc->version, Version);
|
||||
}
|
||||
|
||||
DIR *dir = opendir(args->index_path);
|
||||
if (dir == NULL) {
|
||||
LOG_FATALF("main.c", "Could not open index %s: %s", args->index_path, strerror(errno))
|
||||
}
|
||||
|
||||
char path_tmp[PATH_MAX];
|
||||
snprintf(path_tmp, sizeof(path_tmp), "%stags", args->index_path);
|
||||
IndexCtx.tag_store = store_create(path_tmp, STORE_SIZE_TAG);
|
||||
IndexCtx.tags = store_read_all(IndexCtx.tag_store);
|
||||
|
||||
snprintf(path_tmp, sizeof(path_tmp), "%smeta", args->index_path);
|
||||
IndexCtx.meta_store = store_create(path_tmp, STORE_SIZE_META);
|
||||
IndexCtx.meta = store_read_all(IndexCtx.meta_store);
|
||||
|
||||
index_func f;
|
||||
if (args->print) {
|
||||
f = print_json;
|
||||
} else {
|
||||
f = index_json;
|
||||
}
|
||||
|
||||
IndexCtx.pool = tpool_create(args->threads, elastic_cleanup, args->print == 0);
|
||||
IndexCtx.pool = tpool_create(args->threads, args->print == FALSE);
|
||||
tpool_start(IndexCtx.pool);
|
||||
|
||||
READ_INDICES(file_path, args->index_path, {
|
||||
read_index(file_path, desc.id, desc.type, f);
|
||||
LOG_DEBUGF("main.c", "Read index file %s (%s)", file_path, desc.type);
|
||||
}, {}, !args->incremental);
|
||||
int cnt = 0;
|
||||
|
||||
// Only read the _delete index if we're sending data to ES
|
||||
if (!args->print) {
|
||||
snprintf(file_path, PATH_MAX, "%s_index_delete.list.zst", args->index_path);
|
||||
if (0 == access(file_path, R_OK)) {
|
||||
read_lines(file_path, (line_processor_t) {
|
||||
.data = NULL,
|
||||
.func = delete_document
|
||||
});
|
||||
LOG_DEBUGF("main.c", "Read index file %s (%s)", file_path, desc.type)
|
||||
db = database_create(args->index_path, INDEX_DATABASE);
|
||||
database_open(db);
|
||||
database_iterator_t *iterator = database_create_document_iterator(db);
|
||||
database_document_iter_foreach(json, iterator) {
|
||||
const char *doc_id = cJSON_GetObjectItem(json, "_id")->valuestring;
|
||||
if (args->print) {
|
||||
print_json(json, doc_id);
|
||||
} else {
|
||||
index_json(json, doc_id);
|
||||
cnt +=1;
|
||||
}
|
||||
}
|
||||
|
||||
closedir(dir);
|
||||
free(iterator);
|
||||
database_close(db, FALSE);
|
||||
|
||||
// Only read the _delete index if we're sending data to ES
|
||||
if (!args->print) {
|
||||
// TODO: (delete_list iterator)
|
||||
}
|
||||
|
||||
tpool_wait(IndexCtx.pool);
|
||||
|
||||
tpool_destroy(IndexCtx.pool);
|
||||
|
||||
if (IndexCtx.needs_es_connection) {
|
||||
finish_indexer(args->script, args->async_script, desc.id);
|
||||
finish_indexer(args->script, args->async_script, desc->id);
|
||||
}
|
||||
|
||||
store_destroy(IndexCtx.tag_store);
|
||||
store_destroy(IndexCtx.meta_store);
|
||||
g_hash_table_remove_all(IndexCtx.tags);
|
||||
g_hash_table_destroy(IndexCtx.tags);
|
||||
free(desc);
|
||||
}
|
||||
|
||||
/**
 * "exec-script" subcommand: run the user-supplied update script against an
 * existing index's Elasticsearch documents.
 *
 * The merged diff text declared `desc` twice (old file-based descriptor and
 * new database-based descriptor); this is the coherent post-refactor version
 * that reads the descriptor from the index's SQLite database.
 */
void sist2_exec_script(exec_args_t *args) {

    LogCtx.verbose = TRUE;

    IndexCtx.es_url = args->es_url;
    IndexCtx.es_index = args->es_index;
    IndexCtx.es_insecure_ssl = args->es_insecure_ssl;
    IndexCtx.needs_es_connection = TRUE;

    database_t *db = database_create(args->index_path, INDEX_DATABASE);
    database_open(db);

    index_descriptor_t *desc = database_read_index_descriptor(db);
    LOG_DEBUGF("main.c", "Index version %s", desc->version);

    execute_update_script(args->script, args->async_script, desc->id);
    free(args->script);
    free(desc);
    database_close(db, FALSE);
}
|
||||
|
||||
void sist2_web(web_args_t *args) {
|
||||
@ -569,23 +458,17 @@ void sist2_web(web_args_t *args) {
|
||||
|
||||
for (int i = 0; i < args->index_count; i++) {
|
||||
char *abs_path = abspath(args->indices[i]);
|
||||
if (abs_path == NULL) {
|
||||
return;
|
||||
}
|
||||
char path_tmp[PATH_MAX];
|
||||
|
||||
snprintf(path_tmp, PATH_MAX, "%sthumbs", abs_path);
|
||||
WebCtx.indices[i].store = store_create(path_tmp, STORE_SIZE_TN);
|
||||
|
||||
snprintf(path_tmp, PATH_MAX, "%stags", abs_path);
|
||||
mkdir(path_tmp, S_IWUSR | S_IRUSR | S_IXUSR);
|
||||
WebCtx.indices[i].tag_store = store_create(path_tmp, STORE_SIZE_TAG);
|
||||
|
||||
snprintf(path_tmp, PATH_MAX, "%sdescriptor.json", abs_path);
|
||||
WebCtx.indices[i].desc = read_index_descriptor(path_tmp);
|
||||
|
||||
strcpy(WebCtx.indices[i].path, abs_path);
|
||||
LOG_INFOF("main.c", "Loaded index: [%s]", WebCtx.indices[i].desc.name)
|
||||
|
||||
WebCtx.indices[i].db = database_create(abs_path, INDEX_DATABASE);
|
||||
database_open(WebCtx.indices[i].db);
|
||||
|
||||
index_descriptor_t *desc = database_read_index_descriptor(WebCtx.indices[i].db);
|
||||
WebCtx.indices[i].desc = *desc;
|
||||
free(desc);
|
||||
|
||||
LOG_INFOF("main.c", "Loaded index: [%s]", WebCtx.indices[i].desc.name);
|
||||
free(abs_path);
|
||||
}
|
||||
|
||||
@ -600,7 +483,7 @@ void sist2_web(web_args_t *args) {
|
||||
* Negative number -> Raise error
|
||||
* Specified a valid number -> Continue as normal
|
||||
*/
|
||||
int set_to_negative_if_value_is_zero(struct argparse *self, const struct argparse_option *option) {
|
||||
int set_to_negative_if_value_is_zero(UNUSED(struct argparse *self), const struct argparse_option *option) {
|
||||
int specified_value = *(int *) option->value;
|
||||
|
||||
if (specified_value == 0) {
|
||||
@ -613,6 +496,7 @@ int set_to_negative_if_value_is_zero(struct argparse *self, const struct argpars
|
||||
}
|
||||
}
|
||||
|
||||
#include <zlib.h>
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
// sigsegv_handler = signal(SIGSEGV, sig_handler);
|
||||
@ -645,8 +529,8 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_GROUP("Scan options"),
|
||||
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
|
||||
OPT_INTEGER('q', "thumbnail-quality", &scan_args->tn_quality,
|
||||
"Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT=2",
|
||||
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality),
|
||||
"Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT=2",
|
||||
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_quality),
|
||||
OPT_INTEGER(0, "thumbnail-size", &scan_args->tn_size,
|
||||
"Thumbnail size, in pixels. DEFAULT=500",
|
||||
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->tn_size),
|
||||
@ -656,7 +540,8 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_INTEGER(0, "content-size", &scan_args->content_size,
|
||||
"Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768",
|
||||
set_to_negative_if_value_is_zero, (intptr_t) &scan_args->content_size),
|
||||
OPT_STRING(0, "incremental", &scan_args->incremental,
|
||||
OPT_BOOLEAN(0, "incremental", &scan_args->incremental,
|
||||
// TODO: Update help string
|
||||
"Reuse an existing index and only scan modified files."),
|
||||
OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"),
|
||||
OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."),
|
||||
@ -692,7 +577,8 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_GROUP("Index options"),
|
||||
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
|
||||
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
|
||||
"Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
|
||||
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
|
||||
OPT_BOOLEAN(0, "incremental-index", &index_args->incremental,
|
||||
@ -701,20 +587,22 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_STRING(0, "mappings-file", &index_args->es_mappings_path, "Path to Elasticsearch mappings."),
|
||||
OPT_STRING(0, "settings-file", &index_args->es_settings_path, "Path to Elasticsearch settings."),
|
||||
OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
|
||||
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 100"),
|
||||
OPT_INTEGER(0, "batch-size", &index_args->batch_size, "Index batch size. DEFAULT: 70"),
|
||||
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. "
|
||||
"(You must use this option the first time you use the index command)"),
|
||||
|
||||
OPT_GROUP("Web options"),
|
||||
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
|
||||
"Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
|
||||
OPT_STRING(0, "bind", &web_args->listen_address, "Listen on this address. DEFAULT=localhost:4090"),
|
||||
OPT_STRING(0, "auth", &web_args->credentials, "Basic auth in user:password format"),
|
||||
OPT_STRING(0, "auth0-audience", &web_args->auth0_audience, "API audience/identifier"),
|
||||
OPT_STRING(0, "auth0-domain", &web_args->auth0_domain, "Application domain"),
|
||||
OPT_STRING(0, "auth0-client-id", &web_args->auth0_client_id, "Application client ID"),
|
||||
OPT_STRING(0, "auth0-public-key-file", &web_args->auth0_public_key_path, "Path to Auth0 public key file extracted from <domain>/pem"),
|
||||
OPT_STRING(0, "auth0-public-key-file", &web_args->auth0_public_key_path,
|
||||
"Path to Auth0 public key file extracted from <domain>/pem"),
|
||||
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
|
||||
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
|
||||
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
|
||||
@ -722,7 +610,8 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
OPT_GROUP("Exec-script options"),
|
||||
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl, "Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_BOOLEAN(0, "es-insecure-ssl", &common_es_insecure_ssl,
|
||||
"Do not verify SSL connections to Elasticsearch."),
|
||||
OPT_STRING(0, "es-index", &common_es_index, "Elasticsearch index name. DEFAULT=sist2"),
|
||||
OPT_STRING(0, "script-file", &common_script_path, "Path to user script."),
|
||||
OPT_BOOLEAN(0, "async-script", &common_async_script, "Execute user script asynchronously."),
|
||||
@ -800,7 +689,7 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
} else {
|
||||
argparse_usage(&argparse);
|
||||
LOG_FATALF("main.c", "Invalid command: '%s'\n", argv[0])
|
||||
LOG_FATALF("main.c", "Invalid command: '%s'\n", argv[0]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
|
@ -1,757 +0,0 @@
|
||||
#include "mempool.h"
|
||||
#include <unistd.h>
|
||||
|
||||
#define NCX_SLAB_PAGE_MASK 3
|
||||
#define NCX_SLAB_PAGE 0
|
||||
#define NCX_SLAB_BIG 1
|
||||
#define NCX_SLAB_EXACT 2
|
||||
#define NCX_SLAB_SMALL 3
|
||||
|
||||
#define NCX_SLAB_PAGE_FREE 0
|
||||
#define NCX_SLAB_PAGE_BUSY 0xffffffffffffffff
|
||||
#define NCX_SLAB_PAGE_START 0x8000000000000000
|
||||
|
||||
#define NCX_SLAB_SHIFT_MASK 0x000000000000000f
|
||||
#define NCX_SLAB_MAP_MASK 0xffffffff00000000
|
||||
#define NCX_SLAB_MAP_SHIFT 32
|
||||
|
||||
#define NCX_SLAB_BUSY 0xffffffffffffffff
|
||||
|
||||
|
||||
static ncx_slab_page_t *ncx_slab_alloc_pages(ncx_slab_pool_t *pool, ncx_uint_t pages);
|
||||
|
||||
static void ncx_slab_free_pages(ncx_slab_pool_t *pool, ncx_slab_page_t *page, ncx_uint_t pages);
|
||||
|
||||
static bool ncx_slab_empty(ncx_slab_pool_t *pool, ncx_slab_page_t *page);
|
||||
|
||||
static ncx_uint_t ncx_slab_max_size;
|
||||
static ncx_uint_t ncx_slab_exact_size;
|
||||
static ncx_uint_t ncx_slab_exact_shift;
|
||||
static ncx_uint_t ncx_pagesize;
|
||||
static ncx_uint_t ncx_pagesize_shift;
|
||||
static ncx_uint_t ncx_real_pages;
|
||||
|
||||
/*
 * Initialize a slab pool inside the memory region [pool, pool->end).
 * Layout after init: pool header | slot list heads | page descriptors |
 * page-aligned data area starting at pool->start.
 * Also computes the process-wide page-size constants on first call.
 */
void ncx_slab_init(ncx_slab_pool_t *pool) {
    u_char *p;
    size_t size;
    ncx_uint_t i, n, pages;
    ncx_slab_page_t *slots;

    /* pagesize: derive ncx_pagesize_shift = log2(page size) */
    ncx_pagesize = getpagesize();
    for (n = ncx_pagesize, ncx_pagesize_shift = 0;
         n >>= 1; ncx_pagesize_shift++) { /* void */ }

    /* STUB: compute global size-class thresholds once (not thread-safe;
       presumably called before worker threads start — TODO confirm) */
    if (ncx_slab_max_size == 0) {
        ncx_slab_max_size = ncx_pagesize / 2;
        /* "exact" class: one bitmap word in page->slab covers a full page */
        ncx_slab_exact_size = ncx_pagesize / (8 * sizeof(uintptr_t));
        for (n = ncx_slab_exact_size; n >>= 1; ncx_slab_exact_shift++) {
            /* void */
        }
    }

    pool->min_size = 1 << pool->min_shift;

    /* slot list heads live immediately after the pool header */
    p = (u_char *) pool + sizeof(ncx_slab_pool_t);
    slots = (ncx_slab_page_t *) p;

    /* one slot per size class between min_shift and the page shift */
    n = ncx_pagesize_shift - pool->min_shift;
    for (i = 0; i < n; i++) {
        slots[i].slab = 0;
        slots[i].next = &slots[i];   /* empty circular list: head points to itself */
        slots[i].prev = 0;
    }

    p += n * sizeof(ncx_slab_page_t);

    size = pool->end - p;

    /* each managed page costs one page of data + one descriptor */
    pages = (ncx_uint_t) (size / (ncx_pagesize + sizeof(ncx_slab_page_t)));

    ncx_memzero(p, pages * sizeof(ncx_slab_page_t));

    pool->pages = (ncx_slab_page_t *) p;

    /* free list initially contains one run spanning all pages */
    pool->free.prev = 0;
    pool->free.next = (ncx_slab_page_t *) p;

    pool->pages->slab = pages;
    pool->pages->next = &pool->free;
    pool->pages->prev = (uintptr_t) &pool->free;

    /* data area starts at the first page boundary after the descriptors */
    pool->start = (u_char *)
            ncx_align_ptr((uintptr_t) p + pages * sizeof(ncx_slab_page_t),
                          ncx_pagesize);

    /* alignment may have reduced the usable page count; recompute */
    ncx_real_pages = (pool->end - pool->start) / ncx_pagesize;
    pool->pages->slab = ncx_real_pages;
}
|
||||
|
||||
|
||||
/*
 * Allocate `size` bytes from the slab pool. Returns NULL (cast from 0) on
 * exhaustion. Strategy:
 *   - size >= max (half page): allocate whole pages.
 *   - otherwise round up to a power-of-two class and sub-allocate from a
 *     partially-full page of that class, tracked by a bitmap:
 *       small  (shift <  exact): bitmap stored at the start of the data page
 *       exact  (shift == exact): bitmap is page->slab itself
 *       big    (shift >  exact): bitmap in the high half of page->slab,
 *                                shift stored in the low bits
 */
void *ncx_slab_alloc(ncx_slab_pool_t *pool, size_t size) {
    size_t s;
    uintptr_t p, n, m, mask, *bitmap;
    ncx_uint_t i, slot, shift, map;
    ncx_slab_page_t *page, *prev, *slots;

    if (size >= ncx_slab_max_size) {

        /* whole-page allocation, rounded up to a page multiple */
        page = ncx_slab_alloc_pages(pool, (size >> ncx_pagesize_shift)
                                          + ((size % ncx_pagesize) ? 1 : 0));
        if (page) {
            p = (page - pool->pages) << ncx_pagesize_shift;
            p += (uintptr_t) pool->start;

        } else {
            p = 0;
        }

        goto done;
    }

    /* round size up to a power of two; slot indexes the size-class list */
    if (size > pool->min_size) {
        shift = 1;
        for (s = size - 1; s >>= 1; shift++) { /* void */ }
        slot = shift - pool->min_shift;

    } else {
        shift = pool->min_shift;
        slot = 0;
    }

    slots = (ncx_slab_page_t *) ((u_char *) pool + sizeof(ncx_slab_pool_t));
    page = slots[slot].next;

    /* non-empty list: try to grab a free chunk from an existing page */
    if (page->next != page) {

        if (shift < ncx_slab_exact_shift) {

            do {
                /* bitmap lives at the beginning of the data page itself */
                p = (page - pool->pages) << ncx_pagesize_shift;
                bitmap = (uintptr_t *) (pool->start + p);

                map = (1 << (ncx_pagesize_shift - shift))
                      / (sizeof(uintptr_t) * 8);

                for (n = 0; n < map; n++) {

                    if (bitmap[n] != NCX_SLAB_BUSY) {

                        for (m = 1, i = 0; m; m <<= 1, i++) {
                            if ((bitmap[n] & m)) {
                                continue;
                            }

                            bitmap[n] |= m;

                            /* i becomes the byte offset of the chunk in the page */
                            i = ((n * sizeof(uintptr_t) * 8) << shift)
                                + (i << shift);

                            if (bitmap[n] == NCX_SLAB_BUSY) {
                                /* word full: page stays listed only if another
                                   word still has room */
                                for (n = n + 1; n < map; n++) {
                                    if (bitmap[n] != NCX_SLAB_BUSY) {
                                        p = (uintptr_t) bitmap + i;

                                        goto done;
                                    }
                                }

                                /* page is now completely full: unlink it */
                                prev = (ncx_slab_page_t *)
                                        (page->prev & ~NCX_SLAB_PAGE_MASK);
                                prev->next = page->next;
                                page->next->prev = page->prev;

                                page->next = NULL;
                                page->prev = NCX_SLAB_SMALL;
                            }

                            p = (uintptr_t) bitmap + i;

                            goto done;
                        }
                    }
                }

                page = page->next;

            } while (page);

        } else if (shift == ncx_slab_exact_shift) {

            do {
                if (page->slab != NCX_SLAB_BUSY) {

                    for (m = 1, i = 0; m; m <<= 1, i++) {
                        if ((page->slab & m)) {
                            continue;
                        }

                        page->slab |= m;

                        if (page->slab == NCX_SLAB_BUSY) {
                            /* page full: unlink from the slot list */
                            prev = (ncx_slab_page_t *)
                                    (page->prev & ~NCX_SLAB_PAGE_MASK);
                            prev->next = page->next;
                            page->next->prev = page->prev;

                            page->next = NULL;
                            page->prev = NCX_SLAB_EXACT;
                        }

                        p = (page - pool->pages) << ncx_pagesize_shift;
                        p += i << shift;
                        p += (uintptr_t) pool->start;

                        goto done;
                    }
                }

                page = page->next;

            } while (page);

        } else { /* shift > ncx_slab_exact_shift */

            /* mask of the bitmap bits actually used by this chunk size */
            n = ncx_pagesize_shift - (page->slab & NCX_SLAB_SHIFT_MASK);
            n = 1 << n;
            n = ((uintptr_t) 1 << n) - 1;
            mask = n << NCX_SLAB_MAP_SHIFT;

            do {
                if ((page->slab & NCX_SLAB_MAP_MASK) != mask) {

                    for (m = (uintptr_t) 1 << NCX_SLAB_MAP_SHIFT, i = 0;
                         m & mask;
                         m <<= 1, i++) {
                        if ((page->slab & m)) {
                            continue;
                        }

                        page->slab |= m;

                        if ((page->slab & NCX_SLAB_MAP_MASK) == mask) {
                            /* page full: unlink from the slot list */
                            prev = (ncx_slab_page_t *)
                                    (page->prev & ~NCX_SLAB_PAGE_MASK);
                            prev->next = page->next;
                            page->next->prev = page->prev;

                            page->next = NULL;
                            page->prev = NCX_SLAB_BIG;
                        }

                        p = (page - pool->pages) << ncx_pagesize_shift;
                        p += i << shift;
                        p += (uintptr_t) pool->start;

                        goto done;
                    }
                }

                page = page->next;

            } while (page);
        }
    }

    /* no partially-filled page available: claim a fresh page for this class */
    page = ncx_slab_alloc_pages(pool, 1);

    if (page) {
        if (shift < ncx_slab_exact_shift) {
            p = (page - pool->pages) << ncx_pagesize_shift;
            bitmap = (uintptr_t *) (pool->start + p);

            /* n = number of chunks consumed by the bitmap itself */
            s = 1 << shift;
            n = (1 << (ncx_pagesize_shift - shift)) / 8 / s;

            if (n == 0) {
                n = 1;
            }

            /* mark the bitmap's own chunks, plus the first returned chunk */
            bitmap[0] = (2 << n) - 1;

            map = (1 << (ncx_pagesize_shift - shift)) / (sizeof(uintptr_t) * 8);

            for (i = 1; i < map; i++) {
                bitmap[i] = 0;
            }

            page->slab = shift;
            page->next = &slots[slot];
            page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_SMALL;

            slots[slot].next = page;

            /* first usable chunk is right after the bitmap chunks */
            p = ((page - pool->pages) << ncx_pagesize_shift) + s * n;
            p += (uintptr_t) pool->start;

            goto done;

        } else if (shift == ncx_slab_exact_shift) {

            page->slab = 1;  /* first chunk taken */
            page->next = &slots[slot];
            page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_EXACT;

            slots[slot].next = page;

            p = (page - pool->pages) << ncx_pagesize_shift;
            p += (uintptr_t) pool->start;

            goto done;

        } else { /* shift > ncx_slab_exact_shift */

            /* first bitmap bit set in the high half, shift in the low bits */
            page->slab = ((uintptr_t) 1 << NCX_SLAB_MAP_SHIFT) | shift;
            page->next = &slots[slot];
            page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_BIG;

            slots[slot].next = page;

            p = (page - pool->pages) << ncx_pagesize_shift;
            p += (uintptr_t) pool->start;

            goto done;
        }
    }

    p = 0;  /* out of memory */

done:

    return (void *) p;
}
|
||||
|
||||
|
||||
/*
 * Return a chunk previously obtained from ncx_slab_alloc() to the pool.
 * The owning page and its size class are recovered from the pointer's
 * position; invalid pointers (wrong alignment, already free, outside the
 * pool) are silently ignored (error reporting is commented out).
 */
void ncx_slab_free(ncx_slab_pool_t *pool, void *p) {
    size_t size;
    uintptr_t slab, m, *bitmap;
    ncx_uint_t n, type, slot, shift, map;
    ncx_slab_page_t *slots, *page;

    if ((u_char *) p < pool->start || (u_char *) p > pool->end) {
        // error("ncx_slab_free(): outside of pool");
        goto fail;
    }

    /* locate the page descriptor; the low bits of prev encode the class */
    n = ((u_char *) p - pool->start) >> ncx_pagesize_shift;
    page = &pool->pages[n];
    slab = page->slab;
    type = page->prev & NCX_SLAB_PAGE_MASK;

    switch (type) {

        case NCX_SLAB_SMALL:

            shift = slab & NCX_SLAB_SHIFT_MASK;
            size = 1 << shift;

            if ((uintptr_t) p & (size - 1)) {
                goto wrong_chunk;
            }

            /* n = bitmap word index, m = bit within that word */
            n = ((uintptr_t) p & (ncx_pagesize - 1)) >> shift;
            m = (uintptr_t) 1 << (n & (sizeof(uintptr_t) * 8 - 1));
            n /= (sizeof(uintptr_t) * 8);
            bitmap = (uintptr_t *) ((uintptr_t) p & ~(ncx_pagesize - 1));

            if (bitmap[n] & m) {

                if (page->next == NULL) {
                    /* page was full (unlinked); re-insert into its slot list */
                    slots = (ncx_slab_page_t *)
                            ((u_char *) pool + sizeof(ncx_slab_pool_t));
                    slot = shift - pool->min_shift;

                    page->next = slots[slot].next;
                    slots[slot].next = page;

                    page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_SMALL;
                    page->next->prev = (uintptr_t) page | NCX_SLAB_SMALL;
                }

                bitmap[n] &= ~m;

                /* n = number of chunks occupied by the bitmap itself */
                n = (1 << (ncx_pagesize_shift - shift)) / 8 / (1 << shift);

                if (n == 0) {
                    n = 1;
                }

                /* if any chunk beyond the bitmap chunks is still in use,
                   the page cannot be released */
                if (bitmap[0] & ~(((uintptr_t) 1 << n) - 1)) {
                    goto done;
                }

                map = (1 << (ncx_pagesize_shift - shift)) / (sizeof(uintptr_t) * 8);

                for (n = 1; n < map; n++) {
                    if (bitmap[n]) {
                        goto done;
                    }
                }

                /* page entirely free: return it to the page allocator */
                ncx_slab_free_pages(pool, page, 1);

                goto done;
            }

            goto chunk_already_free;

        case NCX_SLAB_EXACT:

            m = (uintptr_t) 1 <<
                (((uintptr_t) p & (ncx_pagesize - 1)) >> ncx_slab_exact_shift);
            size = ncx_slab_exact_size;

            if ((uintptr_t) p & (size - 1)) {
                goto wrong_chunk;
            }

            if (slab & m) {
                if (slab == NCX_SLAB_BUSY) {
                    /* page was full; re-insert into its slot list */
                    slots = (ncx_slab_page_t *)
                            ((u_char *) pool + sizeof(ncx_slab_pool_t));
                    slot = ncx_slab_exact_shift - pool->min_shift;

                    page->next = slots[slot].next;
                    slots[slot].next = page;

                    page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_EXACT;
                    page->next->prev = (uintptr_t) page | NCX_SLAB_EXACT;
                }

                page->slab &= ~m;

                if (page->slab) {
                    goto done;
                }

                ncx_slab_free_pages(pool, page, 1);

                goto done;
            }

            goto chunk_already_free;

        case NCX_SLAB_BIG:

            shift = slab & NCX_SLAB_SHIFT_MASK;
            size = 1 << shift;

            if ((uintptr_t) p & (size - 1)) {
                goto wrong_chunk;
            }

            /* bitmap occupies the high half of page->slab */
            m = (uintptr_t) 1 << ((((uintptr_t) p & (ncx_pagesize - 1)) >> shift)
                                  + NCX_SLAB_MAP_SHIFT);

            if (slab & m) {

                if (page->next == NULL) {
                    /* page was full; re-insert into its slot list */
                    slots = (ncx_slab_page_t *)
                            ((u_char *) pool + sizeof(ncx_slab_pool_t));
                    slot = shift - pool->min_shift;

                    page->next = slots[slot].next;
                    slots[slot].next = page;

                    page->prev = (uintptr_t) &slots[slot] | NCX_SLAB_BIG;
                    page->next->prev = (uintptr_t) page | NCX_SLAB_BIG;
                }

                page->slab &= ~m;

                if (page->slab & NCX_SLAB_MAP_MASK) {
                    goto done;
                }

                ncx_slab_free_pages(pool, page, 1);

                goto done;
            }

            goto chunk_already_free;

        case NCX_SLAB_PAGE:

            /* whole-page allocation: must point at a page boundary */
            if ((uintptr_t) p & (ncx_pagesize - 1)) {
                goto wrong_chunk;
            }

            if (slab == NCX_SLAB_PAGE_FREE) {
                // alert("ncx_slab_free(): page is already free");
                goto fail;
            }

            if (slab == NCX_SLAB_PAGE_BUSY) {
                // alert("ncx_slab_free(): pointer to wrong page");
                goto fail;
            }

            n = ((u_char *) p - pool->start) >> ncx_pagesize_shift;
            size = slab & ~NCX_SLAB_PAGE_START;  /* run length in pages */

            ncx_slab_free_pages(pool, &pool->pages[n], size);

            return;
    }

    /* not reached */

    return;

done:

    return;

wrong_chunk:

    // error("ncx_slab_free(): pointer to wrong chunk");

    goto fail;

chunk_already_free:

    // error("ncx_slab_free(): chunk is already free");

fail:

    return;
}
|
||||
|
||||
|
||||
/*
 * Take `pages` contiguous pages from the free list (first-fit).
 * Returns the descriptor of the first page, or NULL when no free run is
 * large enough. The first descriptor records the run length (with
 * NCX_SLAB_PAGE_START set); follower descriptors are marked BUSY.
 */
static ncx_slab_page_t *ncx_slab_alloc_pages(ncx_slab_pool_t *pool, ncx_uint_t pages) {
    ncx_slab_page_t *page, *p;

    for (page = pool->free.next; page != &pool->free; page = page->next) {

        if (page->slab >= pages) {

            if (page->slab > pages) {
                /* split: the tail of the run replaces it on the free list */
                page[pages].slab = page->slab - pages;
                page[pages].next = page->next;
                page[pages].prev = page->prev;

                p = (ncx_slab_page_t *) page->prev;
                p->next = &page[pages];
                page->next->prev = (uintptr_t) &page[pages];

            } else {
                /* exact fit: unlink the whole run */
                p = (ncx_slab_page_t *) page->prev;
                p->next = page->next;
                page->next->prev = page->prev;
            }

            page->slab = pages | NCX_SLAB_PAGE_START;
            page->next = NULL;
            page->prev = NCX_SLAB_PAGE;

            if (--pages == 0) {
                return page;
            }

            /* mark follower pages of a multi-page run */
            for (p = page + 1; pages; pages--) {
                p->slab = NCX_SLAB_PAGE_BUSY;
                p->next = NULL;
                p->prev = NCX_SLAB_PAGE;
                p++;
            }

            return page;
        }
    }

    // error("ncx_slab_alloc() failed: no memory");

    return NULL;
}
|
||||
|
||||
/*
 * Return a run of `pages` pages (first descriptor `page`) to the head of the
 * free list. Follower descriptors are zeroed. With PAGE_MERGE defined,
 * adjacent free runs are coalesced.
 *
 * Fix: the PAGE_MERGE section used an undeclared variable `next`, so the
 * function did not compile when PAGE_MERGE was defined; it is now declared
 * under the same conditional.
 */
static void ncx_slab_free_pages(ncx_slab_pool_t *pool, ncx_slab_page_t *page, ncx_uint_t pages) {
    ncx_slab_page_t *prev;
#ifdef PAGE_MERGE
    ncx_slab_page_t *next;
#endif

    if (pages > 1) {
        ncx_memzero(&page[1], (pages - 1) * sizeof(ncx_slab_page_t));
    }

    /* if the page is still on a slot list, unlink it first */
    if (page->next) {
        prev = (ncx_slab_page_t *) (page->prev & ~NCX_SLAB_PAGE_MASK);
        prev->next = page->next;
        page->next->prev = page->prev;
    }

    /* push the run onto the head of the free list */
    page->slab = pages;
    page->prev = (uintptr_t) &pool->free;
    page->next = pool->free.next;
    page->next->prev = (uintptr_t) page;

    pool->free.next = page;

#ifdef PAGE_MERGE
    /* merge with the free run that ends immediately before this one */
    if (pool->pages != page) {
        prev = page - 1;
        if (ncx_slab_empty(pool, prev)) {
            for (; prev >= pool->pages; prev--) {
                if (prev->slab != 0)
                {
                    pool->free.next = page->next;
                    page->next->prev = (uintptr_t) &pool->free;

                    prev->slab += pages;
                    ncx_memzero(page, sizeof(ncx_slab_page_t));

                    page = prev;

                    break;
                }
            }
        }
    }

    /* merge with the free run that starts immediately after this one */
    if ((page - pool->pages + page->slab) < ncx_real_pages) {
        next = page + page->slab;
        if (ncx_slab_empty(pool, next))
        {
            prev = (ncx_slab_page_t *) (next->prev);
            prev->next = next->next;
            next->next->prev = next->prev;

            page->slab += next->slab;
            ncx_memzero(next, sizeof(ncx_slab_page_t));
        }
    }

#endif
}
|
||||
|
||||
/*
 * Walk every page descriptor and accumulate usage statistics per size class
 * (small / exact / big / whole-page) into *stat. Multi-page runs advance the
 * loop counter by the run length.
 */
void ncx_slab_stat(ncx_slab_pool_t *pool, ncx_slab_stat_t *stat) {
    uintptr_t m, n, mask, slab;
    uintptr_t *bitmap;
    ncx_uint_t i, j, map, type, obj_size;
    ncx_slab_page_t *page;

    ncx_memzero(stat, sizeof(ncx_slab_stat_t));

    page = pool->pages;
    stat->pages = (pool->end - pool->start) / ncx_pagesize;

    for (i = 0; i < stat->pages; i++) {
        slab = page->slab;
        type = page->prev & NCX_SLAB_PAGE_MASK;

        switch (type) {

            case NCX_SLAB_SMALL:

                /* bitmap is stored at the start of the data page */
                n = (page - pool->pages) << ncx_pagesize_shift;
                bitmap = (uintptr_t *) (pool->start + n);

                obj_size = 1 << slab;  /* for SMALL pages, slab holds the shift */
                map = (1 << (ncx_pagesize_shift - slab))
                      / (sizeof(uintptr_t) * 8);

                for (j = 0; j < map; j++) {
                    for (m = 1; m; m <<= 1) {
                        if ((bitmap[j] & m)) {
                            stat->used_size += obj_size;
                            stat->b_small += obj_size;
                        }

                    }
                }

                stat->p_small++;

                break;

            case NCX_SLAB_EXACT:

                if (slab == NCX_SLAB_BUSY) {
                    /* fully occupied page: count every chunk at once */
                    stat->used_size += sizeof(uintptr_t) * 8 * ncx_slab_exact_size;
                    stat->b_exact += sizeof(uintptr_t) * 8 * ncx_slab_exact_size;
                } else {
                    for (m = 1; m; m <<= 1) {
                        if (slab & m) {
                            stat->used_size += ncx_slab_exact_size;
                            stat->b_exact += ncx_slab_exact_size;
                        }
                    }
                }

                stat->p_exact++;

                break;

            case NCX_SLAB_BIG:

                /* rebuild the bitmap mask for this chunk size */
                j = ncx_pagesize_shift - (slab & NCX_SLAB_SHIFT_MASK);
                j = 1 << j;
                j = ((uintptr_t) 1 << j) - 1;
                mask = j << NCX_SLAB_MAP_SHIFT;
                obj_size = 1 << (slab & NCX_SLAB_SHIFT_MASK);

                for (m = (uintptr_t) 1 << NCX_SLAB_MAP_SHIFT; m & mask; m <<= 1) {
                    if ((page->slab & m)) {
                        stat->used_size += obj_size;
                        stat->b_big += obj_size;
                    }
                }

                stat->p_big++;

                break;

            case NCX_SLAB_PAGE:

                /* start of an allocated multi-page run */
                if (page->prev == NCX_SLAB_PAGE) {
                    slab = slab & ~NCX_SLAB_PAGE_START;
                    stat->used_size += slab * ncx_pagesize;
                    stat->b_page += slab * ncx_pagesize;
                    stat->p_page += slab;

                    i += (slab - 1);  /* skip the follower pages */

                    break;
                }

                /* fallthrough: a PAGE-typed descriptor with extra prev bits
                   is treated as a free run */

            default:

                /* free run: slab holds the number of free pages */
                if (slab > stat->max_free_pages) {
                    stat->max_free_pages = page->slab;
                }

                stat->free_page += slab;

                i += (slab - 1);

                break;
        }

        page = pool->pages + i + 1;
    }

    stat->pool_size = pool->end - pool->start;
    stat->used_pct = stat->used_size * 100 / stat->pool_size;
}
|
||||
|
||||
/*
 * Heuristic check (used by PAGE_MERGE) for whether `page` heads a free run:
 * a zero slab means free; an unlinked page (next == NULL) is in use; otherwise
 * follow the prev chain out of the descriptor array and see whether it ends
 * at the pool's free-list head.
 */
static bool ncx_slab_empty(ncx_slab_pool_t *pool, ncx_slab_page_t *page) {
    ncx_slab_page_t *prev;

    if (page->slab == 0) {
        return true;
    }

    //page->prev == PAGE | SMALL | EXACT | BIG
    if (page->next == NULL) {
        return false;
    }

    /* walk back until prev leaves the page-descriptor array */
    prev = (ncx_slab_page_t *) (page->prev & ~NCX_SLAB_PAGE_MASK);
    while (prev >= pool->pages) {
        prev = (ncx_slab_page_t *) (prev->prev & ~NCX_SLAB_PAGE_MASK);
    }

    /* the free list is anchored outside the array at &pool->free */
    if (prev == &pool->free) {
        return true;
    }

    return false;
}
|
@ -1,62 +0,0 @@
|
||||
#ifndef SIST2_MEMPOOL_H
#define SIST2_MEMPOOL_H

#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdbool.h>

/* Slab allocator (ncx_slab, derived from the nginx slab allocator) operating
 * inside a caller-provided, contiguous memory region. */

typedef unsigned char u_char;
typedef uintptr_t ncx_uint_t;

#ifndef NCX_ALIGNMENT
#define NCX_ALIGNMENT sizeof(unsigned long)
#endif

/* round d up to a multiple of a (a must be a power of two) */
#define ncx_align(d, a) (((d) + (a - 1)) & ~(a - 1))
#define ncx_align_ptr(p, a) (u_char *) (((uintptr_t) (p) + ((uintptr_t) a - 1)) & ~((uintptr_t) a - 1))

#define ncx_memzero(buf, n) (void) memset(buf, 0, n)
#define ncx_memset(buf, c, n) (void) memset(buf, c, n)

typedef struct ncx_slab_page_s ncx_slab_page_t;

/* Per-page descriptor. `slab` is overloaded: chunk shift, occupancy bitmap,
 * or run length depending on the page type encoded in the low bits of `prev`. */
struct ncx_slab_page_s {
    uintptr_t slab;
    ncx_slab_page_t *next;
    uintptr_t prev;
};

/* Pool header, placed at the start of the managed memory region. */
typedef struct {
    size_t min_size;   // smallest chunk size (1 << min_shift)
    size_t min_shift;  // must be set by the caller before ncx_slab_init()

    ncx_slab_page_t *pages;  // page descriptor array
    ncx_slab_page_t free;    // head of the free-run list

    u_char *start;  // first page-aligned data byte
    u_char *end;    // end of the managed region (must be set by the caller)

    //ncx_shmtx_t mutex;

    void *addr;  // base address of the region
} ncx_slab_pool_t;

/* Usage statistics filled in by ncx_slab_stat(). */
typedef struct {
    size_t pool_size, used_size, used_pct;
    size_t pages, free_page;
    size_t p_small, p_exact, p_big, p_page;  // page counts per class
    size_t b_small, b_exact, b_big, b_page;  // byte counts per class
    size_t max_free_pages;                   // largest contiguous free run
} ncx_slab_stat_t;

void ncx_slab_init(ncx_slab_pool_t *mempool);

void *ncx_slab_alloc(ncx_slab_pool_t *mempool, size_t size);

void ncx_slab_free(ncx_slab_pool_t *mempool, void *p);

void ncx_slab_stat(ncx_slab_pool_t *mempool, ncx_slab_stat_t *stat);

#endif //SIST2_MEMPOOL_H
42
src/parsing/fs_util.h
Normal file
42
src/parsing/fs_util.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef SIST2_FS_UTIL_H
|
||||
#define SIST2_FS_UTIL_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
|
||||
|
||||
static int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
if (f->fd == -1) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
|
||||
f->fd = open(f->filepath, O_RDONLY);
|
||||
if (f->fd == -1) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int ret = (int) read(f->fd, buf, size);
|
||||
|
||||
if (ret != 0 && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void fs_close(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
close(f->fd);
|
||||
}
|
||||
}
|
||||
|
||||
static void fs_reset(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
lseek(f->fd, 0, SEEK_SET);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
32
src/parsing/magic_util.c
Normal file
32
src/parsing/magic_util.c
Normal file
@ -0,0 +1,32 @@
|
||||
#include "magic_util.h"
|
||||
#include "src/log.h"
|
||||
#include "mime.h"
|
||||
#include <magic.h>
|
||||
#include "src/magic_generated.c"
|
||||
|
||||
|
||||
char *magic_buffer_embedded(void *buffer, size_t buffer_size) {
|
||||
|
||||
magic_t magic = magic_open(MAGIC_MIME_TYPE);
|
||||
|
||||
const char *magic_buffers[1] = {magic_database_buffer,};
|
||||
size_t sizes[1] = {sizeof(magic_database_buffer),};
|
||||
|
||||
// TODO: check if we can reuse the magic instance
|
||||
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
|
||||
|
||||
if (load_ret != 0) {
|
||||
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret);
|
||||
}
|
||||
|
||||
const char *magic_mime_str = magic_buffer(magic, buffer, buffer_size);
|
||||
char *return_value = NULL;
|
||||
|
||||
if (magic_mime_str != NULL) {
|
||||
return_value = malloc(strlen(magic_mime_str) + 1);
|
||||
strcpy(return_value, magic_mime_str);
|
||||
}
|
||||
|
||||
magic_close(magic);
|
||||
return return_value;
|
||||
}
|
8
src/parsing/magic_util.h
Normal file
8
src/parsing/magic_util.h
Normal file
@ -0,0 +1,8 @@
|
||||
#ifndef SIST2_MAGIC_UTIL_H
|
||||
#define SIST2_MAGIC_UTIL_H
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
char *magic_buffer_embedded(void *buffer, size_t buffer_size);
|
||||
|
||||
#endif //SIST2_MAGIC_UTIL_H
|
@ -1,22 +1,30 @@
|
||||
#include "mime.h"
|
||||
#include <zlib.h>
|
||||
|
||||
// Look up the internal MIME id for a file extension.
// The extension is lowercased (truncated to at most 15 chars), hashed
// with CRC32 and resolved through the generated lookup table.
// Returns 0 when the extension is unknown.
unsigned int mime_get_mime_by_ext(const char *ext) {
    unsigned char lower[16];
    unsigned char *p = lower;
    int cnt = 0;
    while ((*ext) != '\0' && cnt + 1 < sizeof(lower)) {
        // Cast to unsigned char: passing a plain (possibly negative)
        // char to tolower() is undefined behavior.
        *p++ = (unsigned char) tolower((unsigned char) *ext++);
        cnt++;
    }
    *p = '\0';

    unsigned long crc = crc32(0, lower, cnt);

    unsigned int mime = mime_extension_lookup(crc);
    return mime;
}
|
||||
|
||||
// Resolve a libmagic MIME string (e.g. "text/plain") to the internal
// MIME id via a CRC32 lookup in the generated name table.
// Leading spaces and '[' characters are skipped first.
unsigned int mime_get_mime_by_string(const char *str) {

    const char *name = str;
    while (*name == ' ' || *name == '[') {
        name += 1;
    }

    unsigned long crc = crc32(0, (unsigned char *) name, strlen(name));
    return mime_name_lookup(crc);
}
|
||||
|
@ -51,14 +51,14 @@ enum major_mime {
|
||||
|
||||
enum mime;
|
||||
|
||||
GHashTable *mime_get_mime_table();
|
||||
unsigned int mime_name_lookup(unsigned long mime_crc32);
|
||||
|
||||
GHashTable *mime_get_ext_table();
|
||||
unsigned int mime_extension_lookup(unsigned long extension_crc32);
|
||||
|
||||
char *mime_get_mime_text(unsigned int);
|
||||
const char *mime_get_mime_text(unsigned int);
|
||||
|
||||
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext);
|
||||
unsigned int mime_get_mime_by_ext(const char *ext);
|
||||
|
||||
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str);
|
||||
unsigned int mime_get_mime_by_string(const char *str);
|
||||
|
||||
#endif
|
||||
|
2730
src/parsing/mime_generated.c
vendored
2730
src/parsing/mime_generated.c
vendored
File diff suppressed because it is too large
Load Diff
@ -5,235 +5,242 @@
|
||||
#include "mime.h"
|
||||
#include "src/io/serialize.h"
|
||||
#include "src/parsing/sidecar.h"
|
||||
#include "src/magic_generated.c"
|
||||
|
||||
#include <magic.h>
|
||||
#include "src/parsing/fs_util.h"
|
||||
#include "src/parsing/magic_util.h"
|
||||
#include <pthread.h>
|
||||
|
||||
|
||||
#define MIN_VIDEO_SIZE (1024 * 64)
|
||||
#define MIN_IMAGE_SIZE (512)
|
||||
|
||||
int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
#define MAGIC_BUF_SIZE (4096 * 6)
|
||||
|
||||
if (f->fd == -1) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
typedef enum {
|
||||
FILETYPE_DONT_PARSE,
|
||||
FILETYPE_RAW,
|
||||
FILETYPE_MEDIA,
|
||||
FILETYPE_EBOOK,
|
||||
FILETYPE_MARKUP,
|
||||
FILETYPE_TEXT,
|
||||
FILETYPE_FONT,
|
||||
FILETYPE_ARCHIVE,
|
||||
FILETYPE_OOXML,
|
||||
FILETYPE_COMIC,
|
||||
FILETYPE_MOBI,
|
||||
FILETYPE_SIST2_SIDECAR,
|
||||
FILETYPE_MSDOC,
|
||||
FILETYPE_JSON,
|
||||
FILETYPE_NDJSON,
|
||||
} file_type_t;
|
||||
|
||||
f->fd = open(f->filepath, O_RDONLY);
|
||||
if (f->fd == -1) {
|
||||
return -1;
|
||||
file_type_t get_file_type(unsigned int mime, size_t size, const char *filepath) {
|
||||
|
||||
int major_mime = MAJOR_MIME(mime);
|
||||
|
||||
if (!(SHOULD_PARSE(mime))) {
|
||||
return FILETYPE_DONT_PARSE;
|
||||
} else if (IS_RAW(mime)) {
|
||||
return FILETYPE_RAW;
|
||||
} else if ((major_mime == MimeVideo && size >= MIN_VIDEO_SIZE) ||
|
||||
(major_mime == MimeImage && size >= MIN_IMAGE_SIZE) || major_mime == MimeAudio) {
|
||||
return FILETYPE_MEDIA;
|
||||
} else if (IS_PDF(mime)) {
|
||||
return FILETYPE_EBOOK;
|
||||
} else if (major_mime == MimeText && ScanCtx.text_ctx.content_size > 0) {
|
||||
if (IS_MARKUP(mime)) {
|
||||
return FILETYPE_MARKUP;
|
||||
} else {
|
||||
return FILETYPE_TEXT;
|
||||
}
|
||||
|
||||
} else if (IS_FONT(mime)) {
|
||||
return FILETYPE_FONT;
|
||||
} else if (
|
||||
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
|
||||
IS_ARC(mime) ||
|
||||
(IS_ARC_FILTER(mime) && should_parse_filtered_file(filepath))
|
||||
)) {
|
||||
return FILETYPE_ARCHIVE;
|
||||
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(mime)) {
|
||||
return FILETYPE_OOXML;
|
||||
} else if (is_cbr(&ScanCtx.comic_ctx, mime) || is_cbz(&ScanCtx.comic_ctx, mime)) {
|
||||
return FILETYPE_COMIC;
|
||||
} else if (IS_MOBI(mime)) {
|
||||
return FILETYPE_MOBI;
|
||||
} else if (mime == MIME_SIST2_SIDECAR) {
|
||||
return FILETYPE_SIST2_SIDECAR;
|
||||
} else if (is_msdoc(&ScanCtx.msdoc_ctx, mime)) {
|
||||
return FILETYPE_MSDOC;
|
||||
} else if (is_json(&ScanCtx.json_ctx, mime)) {
|
||||
return FILETYPE_JSON;
|
||||
} else if (is_ndjson(&ScanCtx.json_ctx, mime)) {
|
||||
return FILETYPE_NDJSON;
|
||||
}
|
||||
}
|
||||
|
||||
#define GET_MIME_ERROR_FATAL (-1)
|
||||
|
||||
int get_mime(parse_job_t *job) {
|
||||
|
||||
char *extension = job->filepath + job->ext;
|
||||
|
||||
int mime = 0;
|
||||
|
||||
if (job->vfile.st_size == 0) {
|
||||
return MIME_EMPTY;
|
||||
}
|
||||
|
||||
if (*extension != '\0' && (job->ext - job->base != 1)) {
|
||||
mime = (int) mime_get_mime_by_ext(extension);
|
||||
|
||||
if (mime != 0) {
|
||||
return mime;
|
||||
}
|
||||
}
|
||||
|
||||
int ret = (int) read(f->fd, buf, size);
|
||||
|
||||
if (ret != 0 && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
|
||||
if (strlen(extension) == 0 && strlen(job->filepath + job->base) == 40) {
|
||||
fprintf(stderr, "GIT? %s", job->filepath);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
|
||||
|
||||
void fs_close(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
close(f->fd);
|
||||
if (ScanCtx.fast) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
void fs_reset(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
lseek(f->fd, 0, SEEK_SET);
|
||||
// Get mime type with libmagic
|
||||
if (job->vfile.read_rewindable == NULL) {
|
||||
LOG_WARNING(job->filepath,
|
||||
"File does not support rewindable reads, cannot guess Media type");
|
||||
return 0;
|
||||
}
|
||||
|
||||
char *buf[MAGIC_BUF_SIZE];
|
||||
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
|
||||
if (bytes_read < 0) {
|
||||
if (job->vfile.is_fs_file) {
|
||||
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno));
|
||||
} else {
|
||||
LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc));
|
||||
}
|
||||
|
||||
|
||||
return GET_MIME_ERROR_FATAL;
|
||||
}
|
||||
|
||||
char *magic_mime_str = magic_buffer_embedded(buf, bytes_read);
|
||||
|
||||
if (magic_mime_str != NULL) {
|
||||
mime = (int) mime_get_mime_by_string(magic_mime_str);
|
||||
free(magic_mime_str);
|
||||
|
||||
if (mime == 0) {
|
||||
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (job->vfile.reset != NULL) {
|
||||
job->vfile.reset(&job->vfile);
|
||||
}
|
||||
|
||||
return mime;
|
||||
}
|
||||
|
||||
void set_dbg_current_file(parse_job_t *job) {
|
||||
unsigned long long pid = (unsigned long long) pthread_self();
|
||||
pthread_mutex_lock(&ScanCtx.dbg_current_files_mu);
|
||||
g_hash_table_replace(ScanCtx.dbg_current_files, GINT_TO_POINTER(pid), job);
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_current_files_mu);
|
||||
}
|
||||
void parse(parse_job_t *job) {
|
||||
|
||||
void parse_job(parse_job_t *job) {
|
||||
tpool_work_arg_shm_t *arg = malloc(sizeof(tpool_work_arg_shm_t) + sizeof(*job));
|
||||
|
||||
memcpy(arg->arg, job, sizeof(*job));
|
||||
arg->arg_size = -1;
|
||||
|
||||
parse(arg);
|
||||
|
||||
free(arg);
|
||||
}
|
||||
|
||||
void parse(tpool_work_arg_shm_t *arg) {
|
||||
|
||||
parse_job_t *job = (void*)arg->arg;
|
||||
if (job->vfile.is_fs_file) {
|
||||
job->vfile.read = fs_read;
|
||||
job->vfile.read_rewindable = fs_read;
|
||||
job->vfile.reset = fs_reset;
|
||||
job->vfile.close = fs_close;
|
||||
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
|
||||
}
|
||||
|
||||
document_t *doc = malloc(sizeof(document_t));
|
||||
|
||||
set_dbg_current_file(job);
|
||||
|
||||
strcpy(doc->filepath, job->filepath);
|
||||
doc->ext = (short) job->ext;
|
||||
doc->base = (short) job->base;
|
||||
|
||||
char *rel_path = doc->filepath + ScanCtx.index.desc.root_len;
|
||||
generate_doc_id(rel_path, doc->doc_id);
|
||||
|
||||
doc->ext = job->ext;
|
||||
doc->base = job->base;
|
||||
doc->meta_head = NULL;
|
||||
doc->meta_tail = NULL;
|
||||
doc->mime = 0;
|
||||
doc->size = job->vfile.st_size;
|
||||
doc->mtime = (int) job->vfile.mtime;
|
||||
doc->mime = get_mime(job);
|
||||
generate_doc_id(doc->filepath + ScanCtx.index.desc.root_len, doc->doc_id);
|
||||
|
||||
int inc_ts = incremental_get(ScanCtx.original_table, doc->doc_id);
|
||||
if (inc_ts != 0 && inc_ts == job->vfile.mtime) {
|
||||
pthread_mutex_lock(&ScanCtx.copy_table_mu);
|
||||
incremental_mark_file(ScanCtx.copy_table, doc->doc_id);
|
||||
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
|
||||
if (doc->mime == GET_MIME_ERROR_FATAL) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_failed_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
return;
|
||||
}
|
||||
|
||||
if (database_mark_document(ProcData.index_db, doc->doc_id, doc->mtime)) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_skipped_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (ScanCtx.new_table != NULL) {
|
||||
pthread_mutex_lock(&ScanCtx.copy_table_mu);
|
||||
incremental_mark_file(ScanCtx.new_table, doc->doc_id);
|
||||
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
|
||||
}
|
||||
|
||||
char *buf[MAGIC_BUF_SIZE];
|
||||
|
||||
if (LogCtx.very_verbose) {
|
||||
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id)
|
||||
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id);
|
||||
}
|
||||
|
||||
if (job->ext > 4096) {
|
||||
fprintf(stderr, "Ext is %d, filename is %s\n", job->ext, job->filepath);
|
||||
}
|
||||
|
||||
if (job->vfile.st_size == 0) {
|
||||
doc->mime = MIME_EMPTY;
|
||||
} else if (*(job->filepath + job->ext) != '\0' && (job->ext - job->base != 1)) {
|
||||
doc->mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
|
||||
}
|
||||
|
||||
if (doc->mime == 0 && !ScanCtx.fast) {
|
||||
|
||||
// Get mime type with libmagic
|
||||
if (job->vfile.read_rewindable == NULL) {
|
||||
LOG_WARNING(job->filepath,
|
||||
"File does not support rewindable reads, cannot guess Media type");
|
||||
goto abort;
|
||||
}
|
||||
|
||||
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
|
||||
if (bytes_read < 0) {
|
||||
|
||||
if (job->vfile.is_fs_file) {
|
||||
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno))
|
||||
} else {
|
||||
LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc))
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_failed_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
switch (get_file_type(doc->mime, doc->size, doc->filepath)) {
|
||||
case FILETYPE_RAW:
|
||||
parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_MEDIA:
|
||||
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
|
||||
break;
|
||||
case FILETYPE_EBOOK:
|
||||
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
|
||||
break;
|
||||
case FILETYPE_MARKUP:
|
||||
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_TEXT:
|
||||
parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_FONT:
|
||||
parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_ARCHIVE:
|
||||
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
|
||||
break;
|
||||
case FILETYPE_OOXML:
|
||||
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_COMIC:
|
||||
parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_MOBI:
|
||||
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_SIST2_SIDECAR:
|
||||
parse_sidecar(&job->vfile, doc);
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
magic_t magic = magic_open(MAGIC_MIME_TYPE);
|
||||
|
||||
const char *magic_buffers[1] = {magic_database_buffer,};
|
||||
size_t sizes[1] = {sizeof(magic_database_buffer),};
|
||||
|
||||
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
|
||||
|
||||
if (load_ret != 0) {
|
||||
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret)
|
||||
}
|
||||
|
||||
const char *magic_mime_str = magic_buffer(magic, buf, bytes_read);
|
||||
if (magic_mime_str != NULL) {
|
||||
doc->mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
|
||||
|
||||
LOG_DEBUGF(job->filepath, "libmagic: %s", magic_mime_str);
|
||||
|
||||
if (doc->mime == 0) {
|
||||
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
|
||||
}
|
||||
}
|
||||
|
||||
if (job->vfile.reset != NULL) {
|
||||
job->vfile.reset(&job->vfile);
|
||||
}
|
||||
|
||||
magic_close(magic);
|
||||
case FILETYPE_MSDOC:
|
||||
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_JSON:
|
||||
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_NDJSON:
|
||||
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_DONT_PARSE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
int mmime = MAJOR_MIME(doc->mime);
|
||||
|
||||
if (!(SHOULD_PARSE(doc->mime))) {
|
||||
|
||||
} else if (IS_RAW(doc->mime)) {
|
||||
parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
|
||||
} else if ((mmime == MimeVideo && doc->size >= MIN_VIDEO_SIZE) ||
|
||||
(mmime == MimeImage && doc->size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
|
||||
|
||||
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
|
||||
|
||||
} else if (IS_PDF(doc->mime)) {
|
||||
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
|
||||
|
||||
} else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) {
|
||||
if (IS_MARKUP(doc->mime)) {
|
||||
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
} else {
|
||||
parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
}
|
||||
|
||||
} else if (IS_FONT(doc->mime)) {
|
||||
parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
|
||||
|
||||
} else if (
|
||||
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
|
||||
IS_ARC(doc->mime) ||
|
||||
(IS_ARC_FILTER(doc->mime) && should_parse_filtered_file(doc->filepath, doc->ext))
|
||||
)) {
|
||||
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
|
||||
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(doc->mime)) {
|
||||
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
|
||||
} else if (is_cbr(&ScanCtx.comic_ctx, doc->mime) || is_cbz(&ScanCtx.comic_ctx, doc->mime)) {
|
||||
parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
|
||||
} else if (IS_MOBI(doc->mime)) {
|
||||
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
|
||||
} else if (doc->mime == MIME_SIST2_SIDECAR) {
|
||||
parse_sidecar(&job->vfile, doc);
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
return;
|
||||
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
|
||||
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
|
||||
} else if (is_json(&ScanCtx.json_ctx, doc->mime)) {
|
||||
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
} else if (is_ndjson(&ScanCtx.json_ctx, doc->mime)) {
|
||||
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
}
|
||||
|
||||
abort:
|
||||
|
||||
//Parent meta
|
||||
if (job->parent[0] != '\0') {
|
||||
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + SIST_INDEX_ID_LEN);
|
||||
@ -247,12 +254,8 @@ void parse(tpool_work_arg_shm_t *arg) {
|
||||
if (job->vfile.has_checksum) {
|
||||
char sha1_digest_str[SHA1_STR_LENGTH];
|
||||
buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str);
|
||||
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str);
|
||||
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str)
|
||||
}
|
||||
|
||||
write_document(doc);
|
||||
}
|
||||
|
||||
void cleanup_parse() {
|
||||
// noop
|
||||
}
|
||||
|
@ -4,15 +4,7 @@
|
||||
#include "../sist.h"
|
||||
#include "src/tpool.h"
|
||||
|
||||
#define MAGIC_BUF_SIZE (4096 * 6)
|
||||
|
||||
int fs_read(struct vfile *f, void *buf, size_t size);
|
||||
void fs_close(struct vfile *f);
|
||||
void fs_reset(struct vfile *f);
|
||||
|
||||
void parse_job(parse_job_t *job);
|
||||
void parse(tpool_work_arg_shm_t *arg);
|
||||
|
||||
void cleanup_parse();
|
||||
void parse(parse_job_t *arg);
|
||||
|
||||
#endif
|
||||
|
@ -4,12 +4,12 @@
|
||||
|
||||
void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath)
|
||||
LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath);
|
||||
|
||||
size_t size;
|
||||
char *buf = read_all(vfile, &size);
|
||||
if (buf == NULL) {
|
||||
LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath)
|
||||
LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -18,7 +18,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
cJSON *json = cJSON_Parse(buf);
|
||||
if (json == NULL) {
|
||||
LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath)
|
||||
LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath);
|
||||
return;
|
||||
}
|
||||
char *json_str = cJSON_PrintUnformatted(json);
|
||||
@ -32,8 +32,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
generate_doc_id(rel_path, assoc_doc_id);
|
||||
|
||||
store_write(ScanCtx.index.meta_store, assoc_doc_id, sizeof(assoc_doc_id), json_str,
|
||||
strlen(json_str) + 1);
|
||||
database_write_document_sidecar(ProcData.index_db, assoc_doc_id, json_str);
|
||||
|
||||
cJSON_Delete(json);
|
||||
free(json_str);
|
||||
|
@ -49,8 +49,11 @@
|
||||
#include <ctype.h>
|
||||
#include "git_hash.h"
|
||||
|
||||
#define VERSION "2.14.3"
|
||||
#define VERSION "3.0.0"
|
||||
static const char *const Version = VERSION;
|
||||
static const int VersionMajor = 3;
|
||||
static const int VersionMinor = 0;
|
||||
static const int VersionPatch = 0;
|
||||
|
||||
#ifndef SIST_PLATFORM
|
||||
#define SIST_PLATFORM unknown
|
||||
|
343
src/stats.c
343
src/stats.c
@ -1,343 +0,0 @@
|
||||
#include "sist.h"
|
||||
#include "io/serialize.h"
|
||||
#include "ctx.h"
|
||||
|
||||
static GHashTable *FlatTree;
|
||||
static GHashTable *BufferTable;
|
||||
|
||||
static GHashTable *AggMime;
|
||||
static GHashTable *AggSize;
|
||||
static GHashTable *AggDate;
|
||||
|
||||
#define SIZE_BUCKET (long)(5 * 1024 * 1024)
|
||||
#define DATE_BUCKET (long)(2629800)
|
||||
|
||||
static long TotalSize = 0;
|
||||
static long DocumentCount = 0;
|
||||
|
||||
typedef struct {
|
||||
long size;
|
||||
long count;
|
||||
} agg_t;
|
||||
|
||||
void fill_tables(cJSON *document, UNUSED(const char index_id[SIST_INDEX_ID_LEN])) {
|
||||
|
||||
if (cJSON_GetObjectItem(document, "parent") != NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char *json_path = cJSON_GetObjectItem(document, "path")->valuestring;
|
||||
char *path = malloc(strlen(json_path) + 1);
|
||||
strcpy(path, json_path);
|
||||
|
||||
const char *json_mime = cJSON_GetObjectItem(document, "mime")->valuestring;
|
||||
char *mime;
|
||||
if (json_mime == NULL) {
|
||||
mime = NULL;
|
||||
} else {
|
||||
mime = malloc(strlen(json_mime) + 1);
|
||||
strcpy(mime, json_mime);
|
||||
}
|
||||
|
||||
long size = (long) cJSON_GetObjectItem(document, "size")->valuedouble;
|
||||
int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;
|
||||
|
||||
// treemap
|
||||
void *existing_path = g_hash_table_lookup(FlatTree, path);
|
||||
if (existing_path == NULL) {
|
||||
g_hash_table_insert(FlatTree, path, (gpointer) size);
|
||||
} else {
|
||||
g_hash_table_replace(FlatTree, path, (gpointer) ((long) existing_path + size));
|
||||
}
|
||||
|
||||
// mime agg
|
||||
if (mime != NULL) {
|
||||
agg_t *orig_agg = g_hash_table_lookup(AggMime, mime);
|
||||
if (orig_agg == NULL) {
|
||||
agg_t *agg = malloc(sizeof(agg_t));
|
||||
agg->size = size;
|
||||
agg->count = 1;
|
||||
g_hash_table_insert(AggMime, mime, agg);
|
||||
} else {
|
||||
orig_agg->size += size;
|
||||
orig_agg->count += 1;
|
||||
free(mime);
|
||||
}
|
||||
}
|
||||
|
||||
// size agg
|
||||
long size_bucket = size - (size % SIZE_BUCKET);
|
||||
agg_t *orig_agg = g_hash_table_lookup(AggSize, (gpointer) size_bucket);
|
||||
if (orig_agg == NULL) {
|
||||
agg_t *agg = malloc(sizeof(agg_t));
|
||||
agg->size = size;
|
||||
agg->count = 1;
|
||||
g_hash_table_insert(AggSize, (gpointer) size_bucket, agg);
|
||||
} else {
|
||||
orig_agg->count += 1;
|
||||
orig_agg->size += size;
|
||||
}
|
||||
|
||||
// date agg
|
||||
long date_bucket = mtime - (mtime % DATE_BUCKET);
|
||||
orig_agg = g_hash_table_lookup(AggDate, (gpointer) date_bucket);
|
||||
if (orig_agg == NULL) {
|
||||
agg_t *agg = malloc(sizeof(agg_t));
|
||||
agg->size = size;
|
||||
agg->count = 1;
|
||||
g_hash_table_insert(AggDate, (gpointer) date_bucket, agg);
|
||||
} else {
|
||||
orig_agg->count += 1;
|
||||
orig_agg->size += size;
|
||||
}
|
||||
|
||||
TotalSize += size;
|
||||
DocumentCount += 1;
|
||||
}
|
||||
|
||||
void read_index_into_tables(index_t *index) {
|
||||
char file_path[PATH_MAX];
|
||||
READ_INDICES(file_path, index->path, read_index(file_path, index->desc.id, index->desc.type, fill_tables), {}, 1);
|
||||
}
|
||||
|
||||
// Return the index of the last occurrence of character `c` in `str`,
// or -1 when it does not occur. (The scan starts at the terminating
// NUL, so rfind(s, '\0') returns strlen(s).)
//
// Signed return type: the original declared size_t but returned -1,
// relying on wraparound to SIZE_MAX and on callers truncating back to
// int; `long` makes the -1 sentinel explicit and well-defined.
static long rfind(const char *str, int c) {
    for (long i = (long) strlen(str); i >= 0; i--) {
        if (str[i] == c) {
            return i;
        }
    }
    return -1;
}
|
||||
|
||||
int merge_up(double thresh) {
|
||||
long min_size = (long) (thresh * (double) TotalSize);
|
||||
|
||||
int count = 0;
|
||||
GHashTableIter iter;
|
||||
g_hash_table_iter_init(&iter, FlatTree);
|
||||
|
||||
void *key;
|
||||
void *value;
|
||||
|
||||
while (g_hash_table_iter_next(&iter, &key, &value)) {
|
||||
long size = (long) value;
|
||||
|
||||
if (size < min_size) {
|
||||
int stop = rfind(key, '/');
|
||||
if (stop == -1) {
|
||||
stop = 0;
|
||||
}
|
||||
char *parent = malloc(stop + 1);
|
||||
strncpy(parent, key, stop);
|
||||
*(parent + stop) = '\0';
|
||||
|
||||
void *existing_parent = g_hash_table_lookup(FlatTree, parent);
|
||||
if (existing_parent == NULL) {
|
||||
void *existing_parent2_key;
|
||||
void *existing_parent2_val;
|
||||
int found = g_hash_table_lookup_extended(BufferTable, parent, &existing_parent2_key,
|
||||
&existing_parent2_val);
|
||||
if (!found) {
|
||||
g_hash_table_insert(BufferTable, parent, value);
|
||||
} else {
|
||||
g_hash_table_replace(BufferTable, parent, (gpointer) ((long) existing_parent2_val + size));
|
||||
free(existing_parent2_key);
|
||||
}
|
||||
} else {
|
||||
g_hash_table_replace(FlatTree, parent, (gpointer) ((long) existing_parent + size));
|
||||
}
|
||||
|
||||
g_hash_table_iter_remove(&iter);
|
||||
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
g_hash_table_iter_init(&iter, BufferTable);
|
||||
while (g_hash_table_iter_next(&iter, &key, &value)) {
|
||||
g_hash_table_insert(FlatTree, key, value);
|
||||
g_hash_table_iter_remove(&iter);
|
||||
}
|
||||
|
||||
int size = g_hash_table_size(FlatTree);
|
||||
|
||||
LOG_DEBUGF("stats.c", "Merge up iteration (%d merged, %d in tree)", count, size)
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
 * CSV-escape `str` into `dst` per RFC 4180: if the value contains a
 * comma or a double quote it is wrapped in quotes and every embedded
 * quote is doubled; otherwise it is copied verbatim.
 *
 * Assumes dst is at least PATH_MAX * 4 — worst case is every char
 * doubling, plus the two wrapping quotes and the NUL terminator.
 */
void csv_escape(char *dst, const char *str) {

    // Fast path: nothing to escape. strchr replaces the previous
    // rfind() calls, which scanned the whole string from the end —
    // twice — just to test for presence.
    if (strchr(str, ',') == NULL && strchr(str, '"') == NULL) {
        strcpy(dst, str);
        return;
    }

    char *out = dst;
    *out++ = '"';

    for (const char *ptr = str; *ptr != '\0'; ptr++) {
        if (*ptr == '"') {
            *out++ = '"';  // double embedded quotes
        }
        *out++ = *ptr;
    }

    *out++ = '"';
    *out = '\0';
}
|
||||
|
||||
int open_or_exit(const char *path) {
|
||||
int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
|
||||
if (fd < 0) {
|
||||
LOG_FATALF("stats.c", "Error while creating file: %s [%d]\n", strerror(errno), errno)
|
||||
}
|
||||
return fd;
|
||||
}
|
||||
|
||||
#define TREEMAP_CSV_HEADER "path,size"
|
||||
#define MIME_AGG_CSV_HEADER "mime,size,count"
|
||||
#define SIZE_AGG_CSV_HEADER "bucket,size,count"
|
||||
#define DATE_AGG_CSV_HEADER "bucket,size,count"
|
||||
|
||||
void write_treemap_csv(double thresh, const char *out_path) {
|
||||
|
||||
void *key;
|
||||
void *value;
|
||||
|
||||
long min_size = (long) (thresh * (double) TotalSize);
|
||||
|
||||
int fd = open_or_exit(out_path);
|
||||
int ret = write(fd, TREEMAP_CSV_HEADER, sizeof(TREEMAP_CSV_HEADER) - 1);
|
||||
if (ret == -1) {
|
||||
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
|
||||
}
|
||||
|
||||
GHashTableIter iter;
|
||||
g_hash_table_iter_init(&iter, FlatTree);
|
||||
while (g_hash_table_iter_next(&iter, &key, &value)) {
|
||||
long size = (long) value;
|
||||
|
||||
if (size >= min_size) {
|
||||
char path_buf[PATH_MAX * 4];
|
||||
char buf[PATH_MAX * 4 + 16];
|
||||
|
||||
csv_escape(path_buf, key);
|
||||
size_t written = sprintf(buf, "\n%s,%ld", path_buf, (long) value);
|
||||
ret = write(fd, buf, written);
|
||||
if (ret == -1) {
|
||||
LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
|
||||
}
|
||||
}
|
||||
}
|
||||
close(fd);
|
||||
}
|
||||
|
||||
// Dump a string-keyed aggregation table as CSV: `header` first, then
// one "\n<key>,<size>,<count>" row per entry. Aborts on write error.
void write_agg_csv_str(const char *out_path, const char *header, GHashTable *table) {
    char row[4096];

    int fd = open_or_exit(out_path);
    if (write(fd, header, strlen(header)) == -1) {
        LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
    }

    GHashTableIter it;
    gpointer k;
    gpointer v;
    g_hash_table_iter_init(&it, table);
    while (g_hash_table_iter_next(&it, &k, &v)) {
        const agg_t *agg = v;

        size_t len = sprintf(row, "\n%s,%ld,%ld", (const char *) k, agg->size, agg->count);
        if (write(fd, row, len) == -1) {
            LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
        }
    }

    close(fd);
}
|
||||
|
||||
// Dump a long-keyed (bucketed) aggregation table as CSV: `header`
// first, then one "\n<bucket>,<size>,<count>" row per entry.
// Aborts on write error.
void write_agg_csv_long(const char *out_path, const char *header, GHashTable *table) {
    char row[4096];

    int fd = open_or_exit(out_path);
    if (write(fd, header, strlen(header)) == -1) {
        LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
    }

    GHashTableIter it;
    gpointer k;
    gpointer v;
    g_hash_table_iter_init(&it, table);
    while (g_hash_table_iter_next(&it, &k, &v)) {
        const agg_t *agg = v;

        size_t len = sprintf(row, "\n%ld,%ld,%ld", (long) k, agg->size, agg->count);
        if (write(fd, row, len) == -1) {
            LOG_FATALF("stats.c", "Write error: %s", strerror(errno))
        }
    }

    close(fd);
}
|
||||
|
||||
int generate_stats(index_t *index, const double threshold, const char *out_prefix) {
|
||||
|
||||
FlatTree = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
|
||||
BufferTable = g_hash_table_new(g_str_hash, g_str_equal);
|
||||
|
||||
AggMime = g_hash_table_new_full(g_str_hash, g_str_equal, free, free);
|
||||
AggSize = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free);
|
||||
AggDate = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free);
|
||||
|
||||
LOG_INFO("stats.c", "Generating stats...")
|
||||
|
||||
read_index_into_tables(index);
|
||||
|
||||
LOG_DEBUG("stats.c", "Read index into tables")
|
||||
LOG_DEBUGF("stats.c", "Total size is %ld", TotalSize)
|
||||
LOG_DEBUGF("stats.c", "Document count is %ld", DocumentCount)
|
||||
LOG_DEBUGF("stats.c", "Merging small directories upwards with a threshold of %f%%", threshold * 100)
|
||||
|
||||
while (merge_up(threshold) > 100) {}
|
||||
|
||||
char tmp[PATH_MAX];
|
||||
|
||||
strncpy(tmp, out_prefix, sizeof(tmp));
|
||||
strcat(tmp, "treemap.csv");
|
||||
write_treemap_csv(threshold, tmp);
|
||||
|
||||
strncpy(tmp, out_prefix, sizeof(tmp));
|
||||
strcat(tmp, "mime_agg.csv");
|
||||
write_agg_csv_str(tmp, MIME_AGG_CSV_HEADER, AggMime);
|
||||
|
||||
strncpy(tmp, out_prefix, sizeof(tmp));
|
||||
strcat(tmp, "size_agg.csv");
|
||||
write_agg_csv_long(tmp, SIZE_AGG_CSV_HEADER, AggSize);
|
||||
|
||||
strncpy(tmp, out_prefix, sizeof(tmp));
|
||||
strcat(tmp, "date_agg.csv");
|
||||
write_agg_csv_long(tmp, DATE_AGG_CSV_HEADER, AggDate);
|
||||
|
||||
g_hash_table_remove_all(FlatTree);
|
||||
g_hash_table_destroy(FlatTree);
|
||||
g_hash_table_destroy(BufferTable);
|
||||
|
||||
g_hash_table_remove_all(AggMime);
|
||||
g_hash_table_destroy(AggMime);
|
||||
g_hash_table_remove_all(AggSize);
|
||||
g_hash_table_destroy(AggSize);
|
||||
g_hash_table_remove_all(AggDate);
|
||||
g_hash_table_destroy(AggDate);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,6 +0,0 @@
|
||||
#ifndef SIST2_STATS_H
|
||||
#define SIST2_STATS_H
|
||||
|
||||
int generate_stats(index_t *index, double threshold, const char* out_prefix);
|
||||
|
||||
#endif
|
456
src/tpool.c
456
src/tpool.c
@ -4,257 +4,250 @@
|
||||
#include <pthread.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/wait.h>
|
||||
#include "mempool/mempool.h"
|
||||
#include "parsing/parse.h"
|
||||
|
||||
#define BLANK_STR " "
|
||||
// TODO: Use slab OOM to control queue size
|
||||
#define MAX_QUEUE_SIZE 100000
|
||||
|
||||
typedef struct tpool_work {
|
||||
tpool_work_arg_shm_t *arg;
|
||||
thread_func_t func;
|
||||
struct tpool_work *next;
|
||||
} tpool_work_t;
|
||||
typedef struct {
|
||||
int thread_id;
|
||||
tpool_t *pool;
|
||||
} start_thread_arg_t;
|
||||
|
||||
|
||||
typedef struct tpool {
|
||||
tpool_work_t *work_head;
|
||||
tpool_work_t *work_tail;
|
||||
|
||||
pthread_mutex_t work_mutex;
|
||||
pthread_mutex_t mem_mutex;
|
||||
|
||||
// TODO: Initialize with SHARED attr
|
||||
pthread_cond_t has_work_cond;
|
||||
pthread_cond_t working_cond;
|
||||
|
||||
pthread_t threads[256];
|
||||
|
||||
int thread_cnt;
|
||||
int work_cnt;
|
||||
int done_cnt;
|
||||
int busy_cnt;
|
||||
|
||||
int stop;
|
||||
int waiting;
|
||||
int num_threads;
|
||||
int fork;
|
||||
|
||||
int print_progress;
|
||||
|
||||
void (*cleanup_func)();
|
||||
|
||||
void *shared_memory;
|
||||
size_t shared_memory_size;
|
||||
ncx_slab_pool_t *mempool;
|
||||
struct {
|
||||
job_type_t job_type;
|
||||
int stop;
|
||||
int waiting;
|
||||
database_ipc_ctx_t ipc_ctx;
|
||||
pthread_mutex_t mutex;
|
||||
pthread_mutex_t data_mutex;
|
||||
pthread_cond_t done_working_cond;
|
||||
pthread_cond_t workers_initialized_cond;
|
||||
int busy_count;
|
||||
int initialized_count;
|
||||
} *shm;
|
||||
} tpool_t;
|
||||
|
||||
|
||||
/**
|
||||
* Create a work object
|
||||
*/
|
||||
static tpool_work_t *tpool_work_create(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg) {
|
||||
|
||||
if (func == NULL) {
|
||||
return NULL;
|
||||
void job_destroy(job_t *job) {
|
||||
if (job->type == JOB_PARSE_JOB) {
|
||||
free(job->parse_job);
|
||||
}
|
||||
|
||||
// Copy heap arg to shm arg
|
||||
pthread_mutex_lock(&pool->mem_mutex);
|
||||
|
||||
tpool_work_arg_shm_t *shm_arg = ncx_slab_alloc(pool->mempool, sizeof(tpool_work_arg_shm_t) + arg->arg_size);
|
||||
|
||||
shm_arg->arg_size = arg->arg_size;
|
||||
memcpy(shm_arg->arg, arg->arg, arg->arg_size);
|
||||
|
||||
free(arg->arg);
|
||||
|
||||
tpool_work_t *work = ncx_slab_alloc(pool->mempool, sizeof(tpool_work_t));
|
||||
|
||||
pthread_mutex_unlock(&pool->mem_mutex);
|
||||
|
||||
work->func = func;
|
||||
work->arg = shm_arg;
|
||||
work->next = NULL;
|
||||
|
||||
return work;
|
||||
free(job);
|
||||
}
|
||||
|
||||
void tpool_dump_debug_info(tpool_t *pool) {
|
||||
LOG_DEBUGF("tpool.c", "pool->thread_cnt = %d", pool->thread_cnt)
|
||||
LOG_DEBUGF("tpool.c", "pool->work_cnt = %d", pool->work_cnt)
|
||||
LOG_DEBUGF("tpool.c", "pool->done_cnt = %d", pool->done_cnt)
|
||||
LOG_DEBUGF("tpool.c", "pool->busy_cnt = %d", pool->busy_cnt)
|
||||
LOG_DEBUGF("tpool.c", "pool->stop = %d", pool->stop)
|
||||
}
|
||||
|
||||
/**
|
||||
* Pop work object from thread pool
|
||||
*/
|
||||
static tpool_work_t *tpool_work_get(tpool_t *pool) {
|
||||
|
||||
tpool_work_t *work = pool->work_head;
|
||||
if (work == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (work->next == NULL) {
|
||||
pool->work_head = NULL;
|
||||
pool->work_tail = NULL;
|
||||
} else {
|
||||
pool->work_head = work->next;
|
||||
}
|
||||
|
||||
return work;
|
||||
// TODO
|
||||
LOG_DEBUGF("tpool.c", "pool->num_threads = %d", pool->num_threads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Push work object to thread pool
|
||||
*/
|
||||
int tpool_add_work(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg) {
|
||||
int tpool_add_work(tpool_t *pool, job_t *job) {
|
||||
|
||||
while ((pool->work_cnt - pool->done_cnt) >= MAX_QUEUE_SIZE) {
|
||||
usleep(10000);
|
||||
}
|
||||
tpool_work_t *work = tpool_work_create(pool, func, arg);
|
||||
if (work == NULL) {
|
||||
return 0;
|
||||
if (pool->shm->job_type == JOB_UNDEFINED) {
|
||||
pool->shm->job_type = job->type;
|
||||
} else if (pool->shm->job_type != job->type) {
|
||||
LOG_FATAL("tpool.c", "FIXME: tpool cannot queue jobs with different types!");
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&(pool->work_mutex));
|
||||
if (pool->work_head == NULL) {
|
||||
pool->work_head = work;
|
||||
pool->work_tail = pool->work_head;
|
||||
} else {
|
||||
pool->work_tail->next = work;
|
||||
pool->work_tail = work;
|
||||
}
|
||||
database_add_work(ProcData.ipc_db, job);
|
||||
|
||||
pool->work_cnt++;
|
||||
|
||||
pthread_cond_broadcast(&(pool->has_work_cond));
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
|
||||
return 1;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void worker_thread_loop(tpool_t *pool) {
|
||||
while (TRUE) {
|
||||
pthread_mutex_lock(&pool->work_mutex);
|
||||
if (pool->stop) {
|
||||
if (pool->shm->stop) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (pool->work_head == NULL) {
|
||||
pthread_cond_wait(&(pool->has_work_cond), &(pool->work_mutex));
|
||||
if (pool->shm->job_type == JOB_UNDEFINED) {
|
||||
// Wait before first job is queued
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_timedwait_ms(&pool->shm->ipc_ctx.has_work_cond, &pool->shm->mutex, 1000);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
}
|
||||
|
||||
tpool_work_t *work = tpool_work_get(pool);
|
||||
job_t *job = database_get_work(ProcData.ipc_db, pool->shm->job_type);
|
||||
|
||||
if (work != NULL) {
|
||||
pool->busy_cnt += 1;
|
||||
}
|
||||
if (job != NULL) {
|
||||
pthread_mutex_lock(&(pool->shm->data_mutex));
|
||||
pool->shm->busy_count += 1;
|
||||
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
||||
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
|
||||
if (work != NULL) {
|
||||
if (pool->stop) {
|
||||
if (pool->shm->stop) {
|
||||
break;
|
||||
}
|
||||
|
||||
work->func(work->arg);
|
||||
if (job->type == JOB_PARSE_JOB) {
|
||||
parse(job->parse_job);
|
||||
} else if (job->type == JOB_BULK_LINE) {
|
||||
elastic_index_line(job->bulk_line);
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&pool->mem_mutex);
|
||||
ncx_slab_free(pool->mempool, work->arg);
|
||||
ncx_slab_free(pool->mempool, work);
|
||||
pthread_mutex_unlock(&pool->mem_mutex);
|
||||
}
|
||||
job_destroy(job);
|
||||
|
||||
pthread_mutex_lock(&(pool->work_mutex));
|
||||
if (work != NULL) {
|
||||
pool->busy_cnt -= 1;
|
||||
pool->done_cnt++;
|
||||
pthread_mutex_lock(&(pool->shm->data_mutex));
|
||||
pool->shm->busy_count -= 1;
|
||||
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
||||
|
||||
pthread_mutex_lock(&(pool->shm->ipc_ctx.mutex));
|
||||
pool->shm->ipc_ctx.completed_job_count += 1;
|
||||
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
|
||||
}
|
||||
|
||||
if (pool->print_progress) {
|
||||
|
||||
int done = pool->shm->ipc_ctx.completed_job_count;
|
||||
int count = pool->shm->ipc_ctx.completed_job_count + pool->shm->ipc_ctx.job_count;
|
||||
|
||||
if (LogCtx.json_logs) {
|
||||
progress_bar_print_json(pool->done_cnt, pool->work_cnt, ScanCtx.stat_tn_size,
|
||||
ScanCtx.stat_index_size, pool->waiting);
|
||||
progress_bar_print_json(done,
|
||||
count,
|
||||
ScanCtx.stat_tn_size,
|
||||
ScanCtx.stat_index_size, pool->shm->waiting);
|
||||
} else {
|
||||
progress_bar_print((double) pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size,
|
||||
ScanCtx.stat_index_size);
|
||||
progress_bar_print((double) done / count,
|
||||
ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (pool->work_head == NULL) {
|
||||
pthread_cond_signal(&(pool->working_cond));
|
||||
if (job == NULL) {
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_signal(&pool->shm->done_working_cond);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
}
|
||||
}
|
||||
|
||||
static void worker_proc_init(tpool_t *pool, int thread_id) {
|
||||
// TODO create PID -> thread_id mapping for signal handler
|
||||
|
||||
ProcData.thread_id = thread_id;
|
||||
|
||||
if (ScanCtx.index.path[0] != '\0') {
|
||||
// TODO This should be closed in proc cleanup function
|
||||
ProcData.index_db = database_create(ScanCtx.index.path, INDEX_DATABASE);
|
||||
ProcData.index_db->ipc_ctx = &pool->shm->ipc_ctx;
|
||||
database_open(ProcData.index_db);
|
||||
}
|
||||
|
||||
// TODO /dev/shm
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
ProcData.ipc_db = database_create("/dev/shm/ipc.sist2", IPC_CONSUMER_DATABASE);
|
||||
ProcData.ipc_db->ipc_ctx = &pool->shm->ipc_ctx;
|
||||
database_open(ProcData.ipc_db);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
}
|
||||
|
||||
void worker_proc_cleanup(tpool_t* pool) {
|
||||
if (ProcData.index_db != NULL) {
|
||||
database_close(ProcData.index_db, FALSE);
|
||||
}
|
||||
database_close(ProcData.ipc_db, FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Thread worker function
|
||||
*/
|
||||
static void *tpool_worker(void *arg) {
|
||||
tpool_t *pool = arg;
|
||||
tpool_t *pool = ((start_thread_arg_t *) arg)->pool;
|
||||
|
||||
int pid = fork();
|
||||
if (pool->fork) {
|
||||
while (TRUE) {
|
||||
int pid = fork();
|
||||
|
||||
if (pid == 0) {
|
||||
if (pid == 0) {
|
||||
worker_proc_init(pool, ((start_thread_arg_t *) arg)->thread_id);
|
||||
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_signal(&pool->shm->workers_initialized_cond);
|
||||
pool->shm->initialized_count += 1;
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
worker_thread_loop(pool);
|
||||
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_signal(&pool->shm->done_working_cond);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
worker_proc_cleanup(pool);
|
||||
|
||||
exit(0);
|
||||
|
||||
} else {
|
||||
int status;
|
||||
// TODO: On crash, print debug info and resume thread
|
||||
waitpid(pid, &status, 0);
|
||||
|
||||
LOG_DEBUGF("tpool.c", "Child process terminated with status code %d", WEXITSTATUS(status));
|
||||
|
||||
pthread_mutex_lock(&(pool->shm->ipc_ctx.mutex));
|
||||
pool->shm->ipc_ctx.completed_job_count += 1;
|
||||
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
|
||||
|
||||
pthread_mutex_lock(&(pool->shm->data_mutex));
|
||||
pool->shm->busy_count -= 1;
|
||||
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
||||
|
||||
if (WIFSIGNALED(status)) {
|
||||
// TODO: Get current_job based on PID
|
||||
const char *job_filepath = "TODO";
|
||||
|
||||
LOG_FATALF_NO_EXIT(
|
||||
"tpool.c",
|
||||
"Child process was terminated by signal (%s).\n"
|
||||
BLANK_STR "The process was working on %s",
|
||||
strsignal(WTERMSIG(status)),
|
||||
job_filepath
|
||||
);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
worker_proc_init(pool, ((start_thread_arg_t *) arg)->thread_id);
|
||||
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_signal(&pool->shm->workers_initialized_cond);
|
||||
pool->shm->initialized_count += 1;
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
worker_thread_loop(pool);
|
||||
|
||||
if (pool->cleanup_func != NULL) {
|
||||
LOG_INFO("tpool.c", "Executing cleanup function")
|
||||
pool->cleanup_func();
|
||||
LOG_DEBUG("tpool.c", "Done executing cleanup function")
|
||||
}
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_signal(&pool->shm->done_working_cond);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
pthread_cond_signal(&(pool->working_cond));
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
exit(0);
|
||||
|
||||
} else {
|
||||
int status;
|
||||
// TODO: On crash, print debug info and resume thread
|
||||
waitpid(pid, &status, 0);
|
||||
|
||||
LOG_DEBUGF("tpool.c", "Child process terminated with status code %d", WEXITSTATUS(status))
|
||||
|
||||
pthread_mutex_lock(&(pool->work_mutex));
|
||||
pool->busy_cnt -= 1;
|
||||
pool->done_cnt++;
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
|
||||
if (WIFSIGNALED(status)) {
|
||||
// parse_job_t *job = g_hash_table_lookup(ScanCtx.dbg_current_files, GINT_TO_POINTER(pthread_self()));
|
||||
const char *job_filepath = "TODO";
|
||||
|
||||
LOG_FATALF_NO_EXIT(
|
||||
"tpool.c",
|
||||
"Child process was terminated by signal (%s).\n"
|
||||
BLANK_STR "The process was working on %s",
|
||||
strsignal(WTERMSIG(status)),
|
||||
job_filepath
|
||||
)
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void tpool_wait(tpool_t *pool) {
|
||||
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish")
|
||||
pthread_mutex_lock(&(pool->work_mutex));
|
||||
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish");
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
|
||||
pool->waiting = TRUE;
|
||||
pool->shm->waiting = TRUE;
|
||||
pool->shm->ipc_ctx.no_more_jobs = TRUE;
|
||||
|
||||
while (TRUE) {
|
||||
if (pool->done_cnt < pool->work_cnt) {
|
||||
pthread_cond_wait(&(pool->working_cond), &(pool->work_mutex));
|
||||
if (pool->shm->ipc_ctx.job_count > 0) {
|
||||
pthread_cond_wait(&(pool->shm->done_working_cond), &pool->shm->mutex);
|
||||
} else {
|
||||
LOG_INFOF("tpool.c", "Received head=NULL signal, busy_cnt=%d", pool->busy_cnt);
|
||||
|
||||
if (pool->done_cnt == pool->work_cnt && pool->busy_cnt == 0) {
|
||||
pool->stop = TRUE;
|
||||
if (pool->shm->ipc_ctx.job_count == 0 && pool->shm->busy_count == 0) {
|
||||
pool->shm->stop = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -262,34 +255,25 @@ void tpool_wait(tpool_t *pool) {
|
||||
if (pool->print_progress && !LogCtx.json_logs) {
|
||||
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
|
||||
}
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
LOG_INFO("tpool.c", "Worker threads finished")
|
||||
LOG_INFO("tpool.c", "Worker threads finished");
|
||||
}
|
||||
|
||||
void tpool_destroy(tpool_t *pool) {
|
||||
if (pool == NULL) {
|
||||
return;
|
||||
}
|
||||
LOG_INFO("tpool.c", "Destroying thread pool");
|
||||
|
||||
LOG_INFO("tpool.c", "Destroying thread pool")
|
||||
database_close(ProcData.ipc_db, FALSE);
|
||||
|
||||
pthread_mutex_lock(&(pool->work_mutex));
|
||||
tpool_work_t *work = pool->work_head;
|
||||
int count = 0;
|
||||
while (work != NULL) {
|
||||
tpool_work_t *tmp = work->next;
|
||||
free(work);
|
||||
work = tmp;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
LOG_DEBUGF("tpool.c", "Destroyed %d jobs", count);
|
||||
|
||||
pthread_cond_broadcast(&(pool->has_work_cond));
|
||||
pthread_mutex_unlock(&(pool->work_mutex));
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
pthread_cond_broadcast(&pool->shm->ipc_ctx.has_work_cond);
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
for (size_t i = 0; i < pool->thread_cnt; i++) {
|
||||
for (size_t i = 0; i < pool->num_threads; i++) {
|
||||
pthread_t thread = pool->threads[i];
|
||||
if (thread != 0) {
|
||||
void *_;
|
||||
@ -297,42 +281,33 @@ void tpool_destroy(tpool_t *pool) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INFO("tpool.c", "Final cleanup")
|
||||
pthread_mutex_destroy(&pool->shm->ipc_ctx.mutex);
|
||||
pthread_mutex_destroy(&pool->shm->mutex);
|
||||
pthread_cond_destroy(&pool->shm->ipc_ctx.has_work_cond);
|
||||
pthread_cond_destroy(&pool->shm->done_working_cond);
|
||||
|
||||
pthread_mutex_destroy(&(pool->work_mutex));
|
||||
pthread_cond_destroy(&(pool->has_work_cond));
|
||||
pthread_cond_destroy(&(pool->working_cond));
|
||||
|
||||
munmap(pool->shared_memory, pool->shared_memory_size);
|
||||
munmap(pool->shm, sizeof(*pool->shm));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a thread pool
|
||||
* @param thread_cnt Worker threads count
|
||||
*/
|
||||
tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int print_progress) {
|
||||
tpool_t *tpool_create(int thread_cnt, int print_progress) {
|
||||
|
||||
size_t shm_size = 1024 * 1024 * 2000;
|
||||
int fork = FALSE;
|
||||
|
||||
void *shared_memory = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
|
||||
tpool_t *pool = malloc(sizeof(tpool_t));
|
||||
|
||||
tpool_t *pool = (tpool_t *) shared_memory;
|
||||
pool->shared_memory = shared_memory;
|
||||
pool->shared_memory_size = shm_size;
|
||||
pool->mempool = (ncx_slab_pool_t *) (pool->shared_memory + sizeof(tpool_t));
|
||||
pool->mempool->addr = pool->mempool;
|
||||
pool->mempool->min_shift = 4;
|
||||
pool->mempool->end = pool->shared_memory + shm_size;
|
||||
pool->shm = mmap(NULL, sizeof(*pool->shm), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
|
||||
|
||||
ncx_slab_init(pool->mempool);
|
||||
|
||||
pool->thread_cnt = thread_cnt;
|
||||
pool->work_cnt = 0;
|
||||
pool->done_cnt = 0;
|
||||
pool->busy_cnt = 0;
|
||||
pool->stop = FALSE;
|
||||
pool->waiting = FALSE;
|
||||
pool->cleanup_func = cleanup_func;
|
||||
pool->fork = fork;
|
||||
pool->num_threads = thread_cnt;
|
||||
pool->shm->ipc_ctx.job_count = 0;
|
||||
pool->shm->ipc_ctx.no_more_jobs = FALSE;
|
||||
pool->shm->stop = FALSE;
|
||||
pool->shm->waiting = FALSE;
|
||||
pool->shm->job_type = JOB_UNDEFINED;
|
||||
memset(pool->threads, 0, sizeof(pool->threads));
|
||||
pool->print_progress = print_progress;
|
||||
|
||||
@ -340,27 +315,50 @@ tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int print_progress) {
|
||||
pthread_mutexattr_init(&mutexattr);
|
||||
pthread_mutexattr_setpshared(&mutexattr, TRUE);
|
||||
|
||||
pthread_mutex_init(&(pool->work_mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->mem_mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->shm->mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->shm->data_mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->shm->ipc_ctx.mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->shm->ipc_ctx.db_mutex), &mutexattr);
|
||||
pthread_mutex_init(&(pool->shm->ipc_ctx.index_db_mutex), &mutexattr);
|
||||
|
||||
pthread_condattr_t condattr;
|
||||
pthread_condattr_init(&condattr);
|
||||
pthread_condattr_setpshared(&condattr, TRUE);
|
||||
|
||||
pthread_cond_init(&(pool->has_work_cond), &condattr);
|
||||
pthread_cond_init(&(pool->working_cond), &condattr);
|
||||
pthread_cond_init(&(pool->shm->ipc_ctx.has_work_cond), &condattr);
|
||||
pthread_cond_init(&(pool->shm->done_working_cond), &condattr);
|
||||
pthread_cond_init(&(pool->shm->workers_initialized_cond), &condattr);
|
||||
|
||||
pool->work_head = NULL;
|
||||
pool->work_tail = NULL;
|
||||
remove("/dev/shm/ipc.sist2");
|
||||
remove("/dev/shm/ipc.sist2-wal");
|
||||
remove("/dev/shm/ipc.sist2-shm");
|
||||
ProcData.ipc_db = database_create("/dev/shm/ipc.sist2", IPC_PRODUCER_DATABASE);
|
||||
ProcData.ipc_db->ipc_ctx = &pool->shm->ipc_ctx;
|
||||
database_initialize(ProcData.ipc_db);
|
||||
|
||||
return pool;
|
||||
}
|
||||
|
||||
void tpool_start(tpool_t *pool) {
|
||||
|
||||
LOG_INFOF("tpool.c", "Starting thread pool with %d threads", pool->thread_cnt)
|
||||
LOG_INFOF("tpool.c", "Starting thread pool with %d threads", pool->num_threads);
|
||||
|
||||
for (size_t i = 0; i < pool->thread_cnt; i++) {
|
||||
pthread_create(&pool->threads[i], NULL, tpool_worker, pool);
|
||||
pthread_mutex_lock(&pool->shm->mutex);
|
||||
|
||||
for (int i = 0; i < pool->num_threads; i++) {
|
||||
|
||||
start_thread_arg_t *arg = malloc(sizeof(start_thread_arg_t));
|
||||
arg->thread_id = i + 1;
|
||||
arg->pool = pool;
|
||||
|
||||
pthread_create(&pool->threads[i], NULL, tpool_worker, arg);
|
||||
}
|
||||
|
||||
// Only open the database when all workers are done initializing
|
||||
while (pool->shm->initialized_count != pool->num_threads) {
|
||||
pthread_cond_wait(&pool->shm->workers_initialized_cond, &pool->shm->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&pool->shm->mutex);
|
||||
|
||||
database_open(ProcData.ipc_db);
|
||||
}
|
||||
|
21
src/tpool.h
21
src/tpool.h
@ -2,34 +2,27 @@
|
||||
#define SIST2_TPOOL_H
|
||||
|
||||
#include "sist.h"
|
||||
#include "third-party/libscan/libscan/scan.h"
|
||||
#include "index/elastic.h"
|
||||
#include "src/database/database.h"
|
||||
|
||||
struct tpool;
|
||||
typedef struct tpool tpool_t;
|
||||
|
||||
typedef struct {
|
||||
size_t arg_size;
|
||||
void *arg;
|
||||
} tpool_work_arg_t;
|
||||
|
||||
typedef struct {
|
||||
size_t arg_size;
|
||||
char arg[0];
|
||||
} tpool_work_arg_shm_t;
|
||||
|
||||
typedef void (*thread_func_t)(tpool_work_arg_shm_t *arg);
|
||||
|
||||
tpool_t *tpool_create(int num, void (*cleanup_func)(), int print_progress);
|
||||
tpool_t *tpool_create(int num, int print_progress);
|
||||
|
||||
void tpool_start(tpool_t *pool);
|
||||
|
||||
void tpool_destroy(tpool_t *pool);
|
||||
|
||||
int tpool_add_work(tpool_t *pool, thread_func_t func, tpool_work_arg_t *arg);
|
||||
int tpool_add_work(tpool_t *pool, job_t *job);
|
||||
|
||||
void tpool_wait(tpool_t *pool);
|
||||
|
||||
void tpool_dump_debug_info(tpool_t *pool);
|
||||
|
||||
void job_destroy(job_t *job);
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
14
src/types.h
14
src/types.h
@ -1,24 +1,26 @@
|
||||
#ifndef SIST2_TYPES_H
|
||||
#define SIST2_TYPES_H
|
||||
|
||||
#define INDEX_TYPE_NDJSON "ndjson"
|
||||
typedef struct database database_t;
|
||||
|
||||
typedef struct index_descriptor {
|
||||
char id[SIST_INDEX_ID_LEN];
|
||||
char version[64];
|
||||
int version_major;
|
||||
int version_minor;
|
||||
int version_patch;
|
||||
long timestamp;
|
||||
char root[PATH_MAX];
|
||||
char rewrite_url[8192];
|
||||
short root_len;
|
||||
int root_len;
|
||||
char name[1024];
|
||||
char type[64];
|
||||
} index_descriptor_t;
|
||||
|
||||
typedef struct index_t {
|
||||
struct index_descriptor desc;
|
||||
struct store_t *store;
|
||||
struct store_t *tag_store;
|
||||
struct store_t *meta_store;
|
||||
|
||||
database_t *db;
|
||||
|
||||
char path[PATH_MAX];
|
||||
} index_t;
|
||||
|
||||
|
53
src/util.c
53
src/util.c
@ -25,7 +25,6 @@ dyn_buffer_t url_escape(char *str) {
|
||||
}
|
||||
|
||||
char *abspath(const char *path) {
|
||||
|
||||
char *expanded = expandpath(path);
|
||||
|
||||
char *abs = realpath(expanded, NULL);
|
||||
@ -34,8 +33,7 @@ char *abspath(const char *path) {
|
||||
return NULL;
|
||||
}
|
||||
if (strlen(abs) > 1) {
|
||||
abs = realloc(abs, strlen(abs) + 2);
|
||||
strcat(abs, "/");
|
||||
abs = realloc(abs, strlen(abs) + 1);
|
||||
}
|
||||
|
||||
return abs;
|
||||
@ -76,9 +74,8 @@ char *expandpath(const char *path) {
|
||||
}
|
||||
}
|
||||
|
||||
char *expanded = malloc(strlen(tmp) + 2);
|
||||
char *expanded = malloc(strlen(tmp) + 1);
|
||||
strcpy(expanded, tmp);
|
||||
strcat(expanded, "/");
|
||||
|
||||
wordfree(&w);
|
||||
return expanded;
|
||||
@ -103,6 +100,10 @@ void progress_bar_print_json(size_t done, size_t count, size_t tn_size, size_t i
|
||||
|
||||
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
|
||||
|
||||
if (isnan(percentage)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: Fix this with shm/ctx
|
||||
static int last_val = -1;
|
||||
|
||||
@ -150,10 +151,6 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
|
||||
PrintingProgressBar = TRUE;
|
||||
}
|
||||
|
||||
GHashTable *incremental_get_table() {
|
||||
GHashTable *file_table = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL);
|
||||
return file_table;
|
||||
}
|
||||
|
||||
const char *find_file_in_paths(const char *paths[], const char *filename) {
|
||||
|
||||
@ -167,7 +164,7 @@ const char *find_file_in_paths(const char *paths[], const char *filename) {
|
||||
char path[PATH_MAX];
|
||||
snprintf(path, sizeof(path), "%s%s", apath, filename);
|
||||
|
||||
LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, apath)
|
||||
LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, apath);
|
||||
free(apath);
|
||||
|
||||
struct stat info;
|
||||
@ -269,3 +266,39 @@ void str_unescape(char *dst, const char *str) {
|
||||
}
|
||||
*cur = '\0';
|
||||
}
|
||||
|
||||
#define NSEC_PER_SEC 1000000000
|
||||
|
||||
struct timespec timespec_normalise(struct timespec ts) {
|
||||
while (ts.tv_nsec >= NSEC_PER_SEC) {
|
||||
ts.tv_sec += 1;
|
||||
ts.tv_nsec -= NSEC_PER_SEC;
|
||||
}
|
||||
|
||||
while (ts.tv_nsec <= -NSEC_PER_SEC) {
|
||||
ts.tv_sec -= 1;
|
||||
ts.tv_nsec += NSEC_PER_SEC;
|
||||
}
|
||||
|
||||
if (ts.tv_nsec < 0) {
|
||||
ts.tv_sec -= 1;
|
||||
ts.tv_nsec = (NSEC_PER_SEC + ts.tv_nsec);
|
||||
}
|
||||
|
||||
return ts;
|
||||
}
|
||||
|
||||
struct timespec timespec_add(struct timespec ts1, long usec) {
|
||||
ts1 = timespec_normalise(ts1);
|
||||
|
||||
struct timespec ts2 = timespec_normalise((struct timespec) {
|
||||
.tv_sec = 0,
|
||||
.tv_nsec = usec * 1000
|
||||
});
|
||||
|
||||
ts1.tv_sec += ts2.tv_sec;
|
||||
ts1.tv_nsec += ts2.tv_nsec;
|
||||
|
||||
return timespec_normalise(ts1);
|
||||
}
|
||||
|
||||
|
45
src/util.h
45
src/util.h
@ -5,8 +5,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <glib.h>
|
||||
|
||||
#include "third-party/utf8.h/utf8.h"
|
||||
#include "libscan/scan.h"
|
||||
|
||||
@ -22,9 +20,6 @@ extern int PrintingProgressBar;
|
||||
void progress_bar_print_json(size_t done, size_t count, size_t tn_size, size_t index_size, int waiting);
|
||||
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
|
||||
|
||||
GHashTable *incremental_get_table();
|
||||
|
||||
|
||||
const char *find_file_in_paths(const char **paths, const char *filename);
|
||||
|
||||
|
||||
@ -100,31 +95,23 @@ static void generate_doc_id(const char *rel_path, char *doc_id) {
|
||||
buf2hex(md, sizeof(md), doc_id);
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static void incremental_put(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN], int mtime) {
|
||||
char *ptr = malloc(SIST_DOC_ID_LEN);
|
||||
strcpy(ptr, doc_id);
|
||||
g_hash_table_insert(table, ptr, GINT_TO_POINTER(mtime));
|
||||
}
|
||||
#define MILLISECOND 1000
|
||||
|
||||
__always_inline
|
||||
static int incremental_get(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
|
||||
if (table != NULL) {
|
||||
return GPOINTER_TO_INT(g_hash_table_lookup(table, doc_id));
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
struct timespec timespec_add(struct timespec ts1, long usec);
|
||||
|
||||
/**
|
||||
* Marks a file by adding it to a table.
|
||||
* !!Not thread safe.
|
||||
*/
|
||||
__always_inline
|
||||
static int incremental_mark_file(GHashTable *table, const char doc_id[SIST_DOC_ID_LEN]) {
|
||||
char *ptr = malloc(SIST_DOC_ID_LEN);
|
||||
strcpy(ptr, doc_id);
|
||||
return g_hash_table_insert(table, ptr, GINT_TO_POINTER(1));
|
||||
}
|
||||
#define TIMER_INIT() struct timespec timer_begin
|
||||
#define TIMER_START() clock_gettime(CLOCK_REALTIME, &timer_begin)
|
||||
#define TIMER_END(x) do { \
|
||||
struct timespec timer_end; \
|
||||
clock_gettime(CLOCK_REALTIME, &timer_end); \
|
||||
x = (timer_end.tv_sec - timer_begin.tv_sec) * 1000000 + (timer_end.tv_nsec - timer_begin.tv_nsec) / 1000; \
|
||||
} while (0)
|
||||
|
||||
#define pthread_cond_timedwait_ms(cond, mutex, delay_ms) do {\
|
||||
struct timespec now; \
|
||||
clock_gettime(CLOCK_REALTIME, &now); \
|
||||
struct timespec end_time = timespec_add(now, MILLISECOND * delay_ms); \
|
||||
pthread_cond_timedwait(cond, mutex, &end_time); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
451
src/web/serve.c
451
src/web/serve.c
@ -1,15 +1,14 @@
|
||||
#include "serve.h"
|
||||
|
||||
#include "src/sist.h"
|
||||
#include "src/io/store.h"
|
||||
#include "static_generated.c"
|
||||
//#include "src/io/store.h"
|
||||
#include "src/index/elastic.h"
|
||||
#include "src/index/web.h"
|
||||
#include "src/auth0/auth0_c_api.h"
|
||||
#include "src/web/web_util.h"
|
||||
|
||||
#include <src/ctx.h>
|
||||
|
||||
#define HTTP_SERVER_HEADER "Server: sist2/" VERSION "\r\n"
|
||||
#define HTTP_TEXT_TYPE_HEADER "Content-Type: text/plain;charset=utf-8\r\n"
|
||||
#define HTTP_REPLY_NOT_FOUND mg_http_reply(nc, 404, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Not found");
|
||||
|
||||
@ -20,62 +19,6 @@ static struct mg_http_serve_opts DefaultServeOpts = {
|
||||
.mime_types = ""
|
||||
};
|
||||
|
||||
|
||||
__always_inline
|
||||
static char *address_to_string(struct mg_addr *addr) {
|
||||
static char address_to_string_buf[INET6_ADDRSTRLEN];
|
||||
|
||||
return mg_ntoa(addr, address_to_string_buf, sizeof(address_to_string_buf));
|
||||
}
|
||||
|
||||
static void send_response_line(struct mg_connection *nc, int status_code, size_t length, char *extra_headers) {
|
||||
mg_printf(
|
||||
nc,
|
||||
"HTTP/1.1 %d %s\r\n"
|
||||
HTTP_SERVER_HEADER
|
||||
"Content-Length: %d\r\n"
|
||||
"%s\r\n\r\n",
|
||||
status_code, "OK",
|
||||
length,
|
||||
extra_headers
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
index_t *get_index_by_id(const char *index_id) {
|
||||
for (int i = WebCtx.index_count; i >= 0; i--) {
|
||||
if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
|
||||
return &WebCtx.indices[i];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
store_t *get_store(const char *index_id) {
|
||||
index_t *idx = get_index_by_id(index_id);
|
||||
if (idx != NULL) {
|
||||
return idx->store;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
store_t *get_tag_store(const char *index_id) {
|
||||
index_t *idx = get_index_by_id(index_id);
|
||||
if (idx != NULL) {
|
||||
return idx->tag_store;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void search_index(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (WebCtx.dev) {
|
||||
mg_http_serve_file(nc, hm, "sist2-vue/dist/index.html", &DefaultServeOpts);
|
||||
} else {
|
||||
send_response_line(nc, 200, sizeof(index_html), "Content-Type: text/html");
|
||||
mg_send(nc, index_html, sizeof(index_html));
|
||||
}
|
||||
}
|
||||
|
||||
void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
|
||||
@ -87,7 +30,7 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
|
||||
*(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
|
||||
index_t *index = get_index_by_id(arg_index_id);
|
||||
index_t *index = web_get_index_by_id(arg_index_id);
|
||||
if (index == NULL) {
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
@ -123,87 +66,58 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
mg_http_serve_file(nc, hm, full_path, &opts);
|
||||
}
|
||||
|
||||
void javascript(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
void serve_index_html(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (WebCtx.dev) {
|
||||
mg_http_serve_file(nc, hm, "sist2-vue/dist/index.html", &DefaultServeOpts);
|
||||
} else {
|
||||
web_serve_asset_index_html(nc);
|
||||
}
|
||||
}
|
||||
|
||||
void serve_index_js(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (WebCtx.dev) {
|
||||
mg_http_serve_file(nc, hm, "sist2-vue/dist/js/index.js", &DefaultServeOpts);
|
||||
} else {
|
||||
send_response_line(nc, 200, sizeof(index_js), "Content-Type: application/javascript");
|
||||
mg_send(nc, index_js, sizeof(index_js));
|
||||
web_serve_asset_index_js(nc);
|
||||
}
|
||||
}
|
||||
|
||||
void javascript_vendor(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
void serve_chunk_vendors_js(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (WebCtx.dev) {
|
||||
mg_http_serve_file(nc, hm, "sist2-vue/dist/js/chunk-vendors.js", &DefaultServeOpts);
|
||||
} else {
|
||||
send_response_line(nc, 200, sizeof(chunk_vendors_js), "Content-Type: application/javascript");
|
||||
mg_send(nc, chunk_vendors_js, sizeof(chunk_vendors_js));
|
||||
web_serve_asset_chunk_vendors_js(nc);
|
||||
}
|
||||
}
|
||||
|
||||
void favicon(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
send_response_line(nc, 200, sizeof(favicon_ico), "Content-Type: image/x-icon");
|
||||
mg_send(nc, favicon_ico, sizeof(favicon_ico));
|
||||
void serve_favicon_ico(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
web_serve_asset_favicon_ico(nc);
|
||||
}
|
||||
|
||||
void style(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
send_response_line(nc, 200, sizeof(index_css), "Content-Type: text/css");
|
||||
mg_send(nc, index_css, sizeof(index_css));
|
||||
void serve_style_css(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
web_serve_asset_style_css(nc);
|
||||
}
|
||||
|
||||
void style_vendor(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
send_response_line(nc, 200, sizeof(chunk_vendors_css), "Content-Type: text/css");
|
||||
mg_send(nc, chunk_vendors_css, sizeof(chunk_vendors_css));
|
||||
void serve_chunk_vendors_css(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
web_serve_asset_chunk_vendors_css(nc);
|
||||
}
|
||||
|
||||
void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
void serve_thumbnail(struct mg_connection *nc, struct mg_http_message *hm, const char *arg_index,
|
||||
const char *arg_doc_id, int arg_num) {
|
||||
|
||||
int has_thumbnail_index = FALSE;
|
||||
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) {
|
||||
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 4) {
|
||||
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr)
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
has_thumbnail_index = TRUE;
|
||||
}
|
||||
|
||||
char arg_doc_id[SIST_DOC_ID_LEN];
|
||||
char arg_index[SIST_INDEX_ID_LEN];
|
||||
|
||||
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
|
||||
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
|
||||
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
|
||||
|
||||
store_t *store = get_store(arg_index);
|
||||
if (store == NULL) {
|
||||
LOG_DEBUGF("serve.c", "Could not get store for index: %s", arg_index)
|
||||
database_t *db = web_get_database(arg_index);
|
||||
if (db == NULL) {
|
||||
LOG_DEBUGF("serve.c", "Could not get database for index: %s", arg_index);
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
char *data;
|
||||
size_t data_len = 0;
|
||||
|
||||
if (has_thumbnail_index) {
|
||||
const char *tn_index = hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2;
|
||||
|
||||
char tn_key[sizeof(arg_doc_id) + sizeof(char) * 4];
|
||||
|
||||
memcpy(tn_key, arg_doc_id, sizeof(arg_doc_id));
|
||||
memcpy(tn_key + sizeof(arg_doc_id) - 1, tn_index, sizeof(char) * 4);
|
||||
*(tn_key + sizeof(tn_key) - 1) = '\0';
|
||||
|
||||
data = store_read(store, (char *) tn_key, sizeof(tn_key), &data_len);
|
||||
} else {
|
||||
data = store_read(store, (char *) arg_doc_id, sizeof(arg_doc_id), &data_len);
|
||||
}
|
||||
void *data = database_read_thumbnail(db, arg_doc_id, arg_num, &data_len);
|
||||
|
||||
if (data_len != 0) {
|
||||
send_response_line(
|
||||
web_send_headers(
|
||||
nc, 200, data_len,
|
||||
"Content-Type: image/jpeg\r\n"
|
||||
"Cache-Control: max-age=31536000"
|
||||
@ -216,10 +130,50 @@ void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
}
|
||||
}
|
||||
|
||||
void search(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
void thumbnail_with_num(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2 + 5) {
|
||||
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr);
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
char arg_doc_id[SIST_DOC_ID_LEN];
|
||||
char arg_index[SIST_INDEX_ID_LEN];
|
||||
char arg_num[5] = {0};
|
||||
|
||||
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
|
||||
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
|
||||
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
|
||||
memcpy(arg_num, hm->uri.ptr + SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2, 4);
|
||||
|
||||
int num = (int) strtol(arg_num, NULL, 10);
|
||||
|
||||
serve_thumbnail(nc, hm, arg_index, arg_doc_id, num);
|
||||
}
|
||||
|
||||
void thumbnail(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + SIST_DOC_ID_LEN + 2) {
|
||||
LOG_DEBUGF("serve.c", "Invalid thumbnail path: %.*s", (int) hm->uri.len, hm->uri.ptr);
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
char arg_doc_id[SIST_DOC_ID_LEN];
|
||||
char arg_index[SIST_INDEX_ID_LEN];
|
||||
|
||||
memcpy(arg_index, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
|
||||
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
memcpy(arg_doc_id, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, SIST_DOC_ID_LEN);
|
||||
*(arg_doc_id + SIST_DOC_ID_LEN - 1) = '\0';
|
||||
|
||||
serve_thumbnail(nc, hm, arg_index, arg_doc_id, 0);
|
||||
}
|
||||
|
||||
void search(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (hm->body.len == 0) {
|
||||
LOG_DEBUG("serve.c", "Client sent empty body, ignoring request")
|
||||
LOG_DEBUG("serve.c", "Client sent empty body, ignoring request");
|
||||
mg_http_reply(nc, 400, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER, "Invalid request");
|
||||
return;
|
||||
}
|
||||
@ -266,7 +220,7 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
|
||||
|
||||
if (strcmp(MG_VERSION, EXPECTED_MONGOOSE_VERSION) != 0) {
|
||||
LOG_WARNING("serve.c", "sist2 was not linked with latest mongoose version, "
|
||||
"serving file from disk might not work as expected.")
|
||||
"serving file from disk might not work as expected.");
|
||||
}
|
||||
|
||||
const char *path = cJSON_GetObjectItem(json, "path")->valuestring;
|
||||
@ -285,7 +239,7 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
|
||||
idx->desc.root, path_unescaped, strlen(path_unescaped) == 0 ? "" : "/",
|
||||
name_unescaped, strlen(ext) == 0 ? "" : ".", ext);
|
||||
|
||||
LOG_DEBUGF("serve.c", "Serving file from disk: %s", full_path)
|
||||
LOG_DEBUGF("serve.c", "Serving file from disk: %s", full_path);
|
||||
|
||||
char disposition[8192];
|
||||
snprintf(disposition, sizeof(disposition),
|
||||
@ -372,7 +326,7 @@ void index_info(struct mg_connection *nc) {
|
||||
|
||||
char *json_str = cJSON_PrintUnformatted(json);
|
||||
|
||||
send_response_line(nc, 200, strlen(json_str), "Content-Type: application/json");
|
||||
web_send_headers(nc, 200, strlen(json_str), "Content-Type: application/json");
|
||||
mg_send(nc, json_str, strlen(json_str));
|
||||
free(json_str);
|
||||
cJSON_Delete(json);
|
||||
@ -382,7 +336,7 @@ void index_info(struct mg_connection *nc) {
|
||||
void file(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
|
||||
if (hm->uri.len != SIST_DOC_ID_LEN + 2) {
|
||||
LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr)
|
||||
LOG_DEBUGF("serve.c", "Invalid file path: %.*s", (int) hm->uri.len, hm->uri.ptr);
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
@ -412,7 +366,7 @@ void file(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
next = parent->valuestring;
|
||||
}
|
||||
|
||||
index_t *idx = get_index_by_id(index_id->valuestring);
|
||||
index_t *idx = web_get_index_by_id(index_id->valuestring);
|
||||
|
||||
if (idx == NULL) {
|
||||
cJSON_Delete(doc);
|
||||
@ -431,9 +385,9 @@ void file(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
void status(struct mg_connection *nc) {
|
||||
char *status = elastic_get_status();
|
||||
if (strcmp(status, "open") == 0) {
|
||||
send_response_line(nc, 204, 0, "Content-Type: application/json");
|
||||
web_send_headers(nc, 204, 0, "Content-Type: application/json");
|
||||
} else {
|
||||
send_response_line(nc, 500, 0, "Content-Type: application/json");
|
||||
web_send_headers(nc, 500, 0, "Content-Type: application/json");
|
||||
}
|
||||
|
||||
free(status);
|
||||
@ -475,114 +429,114 @@ tag_req_t *parse_tag_request(cJSON *json) {
|
||||
}
|
||||
|
||||
void tag(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
|
||||
LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr)
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
char arg_index[SIST_INDEX_ID_LEN];
|
||||
memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN);
|
||||
*(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
|
||||
if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
|
||||
LOG_DEBUG("serve.c", "Invalid tag request")
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
store_t *store = get_tag_store(arg_index);
|
||||
if (store == NULL) {
|
||||
LOG_DEBUGF("serve.c", "Could not get tag store for index: %s", arg_index)
|
||||
HTTP_REPLY_NOT_FOUND
|
||||
return;
|
||||
}
|
||||
|
||||
char *body = malloc(hm->body.len + 1);
|
||||
memcpy(body, hm->body.ptr, hm->body.len);
|
||||
*(body + hm->body.len) = '\0';
|
||||
cJSON *json = cJSON_Parse(body);
|
||||
|
||||
tag_req_t *arg_req = parse_tag_request(json);
|
||||
if (arg_req == NULL) {
|
||||
LOG_DEBUGF("serve.c", "Could not parse tag request", arg_index)
|
||||
cJSON_Delete(json);
|
||||
free(body);
|
||||
mg_http_reply(nc, 400, "", "Invalid request");
|
||||
return;
|
||||
}
|
||||
|
||||
cJSON *arr = NULL;
|
||||
|
||||
size_t data_len = 0;
|
||||
const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len);
|
||||
if (data_len == 0) {
|
||||
arr = cJSON_CreateArray();
|
||||
} else {
|
||||
arr = cJSON_Parse(data);
|
||||
}
|
||||
|
||||
if (arg_req->delete) {
|
||||
|
||||
if (data_len > 0) {
|
||||
cJSON *element = NULL;
|
||||
int i = 0;
|
||||
cJSON_ArrayForEach(element, arr) {
|
||||
if (strcmp(element->valuestring, arg_req->name) == 0) {
|
||||
cJSON_DeleteItemFromArray(arr, i);
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
char *buf = malloc(sizeof(char) * 8192);
|
||||
snprintf(buf, 8192,
|
||||
"{"
|
||||
" \"script\" : {"
|
||||
" \"source\": \"if (ctx._source.tag.contains(params.tag)) { ctx._source.tag.remove(ctx._source.tag.indexOf(params.tag)) }\","
|
||||
" \"lang\": \"painless\","
|
||||
" \"params\" : {"
|
||||
" \"tag\" : \"%s\""
|
||||
" }"
|
||||
" }"
|
||||
"}", arg_req->name
|
||||
);
|
||||
|
||||
char url[4096];
|
||||
snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
|
||||
nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
||||
|
||||
} else {
|
||||
cJSON_AddItemToArray(arr, cJSON_CreateString(arg_req->name));
|
||||
|
||||
char *buf = malloc(sizeof(char) * 8192);
|
||||
snprintf(buf, 8192,
|
||||
"{"
|
||||
" \"script\" : {"
|
||||
" \"source\": \"if(ctx._source.tag == null) {ctx._source.tag = new ArrayList()} ctx._source.tag.add(params.tag)\","
|
||||
" \"lang\": \"painless\","
|
||||
" \"params\" : {"
|
||||
" \"tag\" : \"%s\""
|
||||
" }"
|
||||
" }"
|
||||
"}", arg_req->name
|
||||
);
|
||||
|
||||
char url[4096];
|
||||
snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
|
||||
nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
||||
}
|
||||
|
||||
char *json_str = cJSON_PrintUnformatted(arr);
|
||||
store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1);
|
||||
store_flush(store);
|
||||
|
||||
free(arg_req);
|
||||
free(json_str);
|
||||
cJSON_Delete(json);
|
||||
cJSON_Delete(arr);
|
||||
free(body);
|
||||
// if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
|
||||
// LOG_DEBUGF("serve.c", "Invalid tag path: %.*s", (int) hm->uri.len, hm->uri.ptr)
|
||||
// HTTP_REPLY_NOT_FOUND
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// char arg_index[SIST_INDEX_ID_LEN];
|
||||
// memcpy(arg_index, hm->uri.ptr + 5, SIST_INDEX_ID_LEN);
|
||||
// *(arg_index + SIST_INDEX_ID_LEN - 1) = '\0';
|
||||
//
|
||||
// if (hm->body.len < 2 || hm->method.len != 4 || memcmp(&hm->method, "POST", 4) == 0) {
|
||||
// LOG_DEBUG("serve.c", "Invalid tag request")
|
||||
// HTTP_REPLY_NOT_FOUND
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// store_t *store = get_tag_store(arg_index);
|
||||
// if (store == NULL) {
|
||||
// LOG_DEBUGF("serve.c", "Could not get tag store for index: %s", arg_index)
|
||||
// HTTP_REPLY_NOT_FOUND
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// char *body = malloc(hm->body.len + 1);
|
||||
// memcpy(body, hm->body.ptr, hm->body.len);
|
||||
// *(body + hm->body.len) = '\0';
|
||||
// cJSON *json = cJSON_Parse(body);
|
||||
//
|
||||
// tag_req_t *arg_req = parse_tag_request(json);
|
||||
// if (arg_req == NULL) {
|
||||
// LOG_DEBUGF("serve.c", "Could not parse tag request", arg_index)
|
||||
// cJSON_Delete(json);
|
||||
// free(body);
|
||||
// mg_http_reply(nc, 400, "", "Invalid request");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// cJSON *arr = NULL;
|
||||
//
|
||||
// size_t data_len = 0;
|
||||
// const char *data = store_read(store, arg_req->doc_id, SIST_DOC_ID_LEN, &data_len);
|
||||
// if (data_len == 0) {
|
||||
// arr = cJSON_CreateArray();
|
||||
// } else {
|
||||
// arr = cJSON_Parse(data);
|
||||
// }
|
||||
//
|
||||
// if (arg_req->delete) {
|
||||
//
|
||||
// if (data_len > 0) {
|
||||
// cJSON *element = NULL;
|
||||
// int i = 0;
|
||||
// cJSON_ArrayForEach(element, arr) {
|
||||
// if (strcmp(element->valuestring, arg_req->name) == 0) {
|
||||
// cJSON_DeleteItemFromArray(arr, i);
|
||||
// break;
|
||||
// }
|
||||
// i++;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// char *buf = malloc(sizeof(char) * 8192);
|
||||
// snprintf(buf, 8192,
|
||||
// "{"
|
||||
// " \"script\" : {"
|
||||
// " \"source\": \"if (ctx._source.tag.contains(params.tag)) { ctx._source.tag.remove(ctx._source.tag.indexOf(params.tag)) }\","
|
||||
// " \"lang\": \"painless\","
|
||||
// " \"params\" : {"
|
||||
// " \"tag\" : \"%s\""
|
||||
// " }"
|
||||
// " }"
|
||||
// "}", arg_req->name
|
||||
// );
|
||||
//
|
||||
// char url[4096];
|
||||
// snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
|
||||
// nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
||||
//
|
||||
// } else {
|
||||
// cJSON_AddItemToArray(arr, cJSON_CreateString(arg_req->name));
|
||||
//
|
||||
// char *buf = malloc(sizeof(char) * 8192);
|
||||
// snprintf(buf, 8192,
|
||||
// "{"
|
||||
// " \"script\" : {"
|
||||
// " \"source\": \"if(ctx._source.tag == null) {ctx._source.tag = new ArrayList()} ctx._source.tag.add(params.tag)\","
|
||||
// " \"lang\": \"painless\","
|
||||
// " \"params\" : {"
|
||||
// " \"tag\" : \"%s\""
|
||||
// " }"
|
||||
// " }"
|
||||
// "}", arg_req->name
|
||||
// );
|
||||
//
|
||||
// char url[4096];
|
||||
// snprintf(url, sizeof(url), "%s/%s/_update/%s", WebCtx.es_url, WebCtx.es_index, arg_req->doc_id);
|
||||
// nc->fn_data = web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
||||
// }
|
||||
//
|
||||
// char *json_str = cJSON_PrintUnformatted(arr);
|
||||
// store_write(store, arg_req->doc_id, SIST_DOC_ID_LEN, json_str, strlen(json_str) + 1);
|
||||
// store_flush(store);
|
||||
//
|
||||
// free(arg_req);
|
||||
// free(json_str);
|
||||
// cJSON_Delete(json);
|
||||
// cJSON_Delete(arr);
|
||||
// free(body);
|
||||
}
|
||||
|
||||
int validate_auth(struct mg_connection *nc, struct mg_http_message *hm) {
|
||||
@ -601,7 +555,7 @@ int check_auth0(struct mg_http_message *hm) {
|
||||
|
||||
struct mg_str *cookie = mg_http_get_header(hm, "Cookie");
|
||||
if (cookie == NULL) {
|
||||
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)")
|
||||
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -610,7 +564,7 @@ int check_auth0(struct mg_http_message *hm) {
|
||||
|
||||
token = mg_http_get_header_var(*cookie, mg_str("sist2-auth0"));
|
||||
if (token.len == 0) {
|
||||
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)")
|
||||
LOG_WARNING("serve.c", "Unauthorized request (no auth cookie)");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -644,28 +598,31 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
|
||||
}
|
||||
}
|
||||
|
||||
char uri[256];
|
||||
memcpy(uri, hm->uri.ptr, hm->uri.len);
|
||||
*(uri + hm->uri.len) = '\0';
|
||||
LOG_DEBUGF("serve.c", "<%s> GET %s",
|
||||
address_to_string(&(nc->rem)),
|
||||
hm->uri
|
||||
)
|
||||
web_address_to_string(&(nc->rem)),
|
||||
uri
|
||||
);
|
||||
|
||||
if (mg_http_match_uri(hm, "/")) {
|
||||
search_index(nc, hm);
|
||||
serve_index_html(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/favicon.ico")) {
|
||||
favicon(nc, hm);
|
||||
serve_favicon_ico(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/css/index.css")) {
|
||||
style(nc, hm);
|
||||
serve_style_css(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/css/chunk-vendors.css")) {
|
||||
style_vendor(nc, hm);
|
||||
serve_chunk_vendors_css(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/js/index.js")) {
|
||||
javascript(nc, hm);
|
||||
serve_index_js(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/js/chunk-vendors.js")) {
|
||||
javascript_vendor(nc, hm);
|
||||
serve_chunk_vendors_js(nc, hm);
|
||||
return;
|
||||
} else if (mg_http_match_uri(hm, "/i")) {
|
||||
index_info(nc);
|
||||
@ -683,6 +640,8 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
|
||||
status(nc);
|
||||
} else if (mg_http_match_uri(hm, "/f/*")) {
|
||||
file(nc, hm);
|
||||
} else if (mg_http_match_uri(hm, "/t/*/*/*")) {
|
||||
thumbnail_with_num(nc, hm);
|
||||
} else if (mg_http_match_uri(hm, "/t/*/*")) {
|
||||
thumbnail(nc, hm);
|
||||
} else if (mg_http_match_uri(hm, "/s/*/*")) {
|
||||
@ -706,7 +665,7 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
|
||||
response_t *r = ctx->response;
|
||||
|
||||
if (r->status_code == 200) {
|
||||
send_response_line(nc, 200, r->size, "Content-Type: application/json");
|
||||
web_send_headers(nc, 200, r->size, "Content-Type: application/json");
|
||||
mg_send(nc, r->body, r->size);
|
||||
} else if (r->status_code == 0) {
|
||||
sist_log("serve.c", LOG_SIST_ERROR, "Could not connect to elasticsearch!");
|
||||
@ -738,7 +697,7 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
|
||||
|
||||
void serve(const char *listen_address) {
|
||||
|
||||
LOG_INFOF("serve.c", "Starting web server @ http://%s", listen_address)
|
||||
LOG_INFOF("serve.c", "Starting web server @ http://%s", listen_address);
|
||||
|
||||
struct mg_mgr mgr;
|
||||
mg_mgr_init(&mgr);
|
||||
@ -747,12 +706,12 @@ void serve(const char *listen_address) {
|
||||
|
||||
struct mg_connection *nc = mg_http_listen(&mgr, listen_address, ev_router, NULL);
|
||||
if (nc == NULL) {
|
||||
LOG_FATALF("serve.c", "Couldn't bind web server on address %s", listen_address)
|
||||
LOG_FATALF("serve.c", "Couldn't bind web server on address %s", listen_address);
|
||||
}
|
||||
|
||||
while (ok) {
|
||||
mg_mgr_poll(&mgr, 10);
|
||||
}
|
||||
mg_mgr_free(&mgr);
|
||||
LOG_INFO("serve.c", "Finished web event loop")
|
||||
LOG_INFO("serve.c", "Finished web event loop");
|
||||
}
|
||||
|
63
src/web/web_util.c
Normal file
63
src/web/web_util.c
Normal file
@ -0,0 +1,63 @@
|
||||
#include "web_util.h"
|
||||
#include "static_generated.c"
|
||||
|
||||
|
||||
void web_serve_asset_index_html(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(index_html), "Content-Type: text/html");
|
||||
mg_send(nc, index_html, sizeof(index_html));
|
||||
}
|
||||
|
||||
void web_serve_asset_index_js(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(index_js), "Content-Type: application/javascript");
|
||||
mg_send(nc, index_js, sizeof(index_js));
|
||||
}
|
||||
|
||||
void web_serve_asset_chunk_vendors_js(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(chunk_vendors_js), "Content-Type: application/javascript");
|
||||
mg_send(nc, chunk_vendors_js, sizeof(chunk_vendors_js));
|
||||
}
|
||||
|
||||
void web_serve_asset_favicon_ico(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(favicon_ico), "Content-Type: image/x-icon");
|
||||
mg_send(nc, favicon_ico, sizeof(favicon_ico));
|
||||
}
|
||||
|
||||
void web_serve_asset_style_css(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(index_css), "Content-Type: text/css");
|
||||
mg_send(nc, index_css, sizeof(index_css));
|
||||
}
|
||||
|
||||
void web_serve_asset_chunk_vendors_css(struct mg_connection *nc) {
|
||||
web_send_headers(nc, 200, sizeof(chunk_vendors_css), "Content-Type: text/css");
|
||||
mg_send(nc, chunk_vendors_css, sizeof(chunk_vendors_css));
|
||||
}
|
||||
|
||||
index_t *web_get_index_by_id(const char *index_id) {
|
||||
for (int i = WebCtx.index_count; i >= 0; i--) {
|
||||
if (strncmp(index_id, WebCtx.indices[i].desc.id, SIST_INDEX_ID_LEN) == 0) {
|
||||
return &WebCtx.indices[i];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
database_t *web_get_database(const char *index_id) {
|
||||
index_t *idx = web_get_index_by_id(index_id);
|
||||
if (idx != NULL) {
|
||||
return idx->db;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void web_send_headers(struct mg_connection *nc, int status_code, size_t length, char *extra_headers) {
|
||||
mg_printf(
|
||||
nc,
|
||||
"HTTP/1.1 %d %s\r\n"
|
||||
HTTP_SERVER_HEADER
|
||||
"Content-Length: %d\r\n"
|
||||
"%s\r\n\r\n",
|
||||
status_code, "OK",
|
||||
length,
|
||||
extra_headers
|
||||
);
|
||||
}
|
32
src/web/web_util.h
Normal file
32
src/web/web_util.h
Normal file
@ -0,0 +1,32 @@
|
||||
#ifndef SIST2_WEB_UTIL_H
|
||||
#define SIST2_WEB_UTIL_H
|
||||
|
||||
#include "src/sist.h"
|
||||
#include "src/index/elastic.h"
|
||||
#include "src/ctx.h"
|
||||
#include <mongoose.h>
|
||||
|
||||
#define HTTP_SERVER_HEADER "Server: sist2/" VERSION "\r\n"
|
||||
|
||||
index_t *web_get_index_by_id(const char *index_id);
|
||||
|
||||
database_t *web_get_database(const char *index_id);
|
||||
|
||||
__always_inline
|
||||
static char *web_address_to_string(struct mg_addr *addr) {
|
||||
return "TODO";
|
||||
// static char address_to_string_buf[INET6_ADDRSTRLEN];
|
||||
//
|
||||
// return mg_ntoa(addr, address_to_string_buf, sizeof(address_to_string_buf));
|
||||
}
|
||||
|
||||
void web_send_headers(struct mg_connection *nc, int status_code, size_t length, char *extra_headers);
|
||||
|
||||
void web_serve_asset_index_html(struct mg_connection *nc);
|
||||
void web_serve_asset_index_js(struct mg_connection *nc);
|
||||
void web_serve_asset_chunk_vendors_js(struct mg_connection *nc);
|
||||
void web_serve_asset_favicon_ico(struct mg_connection *nc);
|
||||
void web_serve_asset_style_css(struct mg_connection *nc);
|
||||
void web_serve_asset_chunk_vendors_css(struct mg_connection *nc);
|
||||
|
||||
#endif //SIST2_WEB_UTIL_H
|
3
third-party/libscan/CMakeLists.txt
vendored
3
third-party/libscan/CMakeLists.txt
vendored
@ -97,7 +97,6 @@ find_package(LibLZMA REQUIRED)
|
||||
find_package(ZLIB REQUIRED)
|
||||
find_package(unofficial-pcre CONFIG REQUIRED)
|
||||
|
||||
|
||||
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
|
||||
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
|
||||
find_library(FREETYPE_LIB NAMES freetype freetyped)
|
||||
@ -110,6 +109,7 @@ find_library(CMS_LIB NAMES lcms2)
|
||||
find_library(JAS_LIB NAMES jasper)
|
||||
find_library(GUMBO_LIB NAMES gumbo)
|
||||
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
|
||||
find_package(Leptonica CONFIG REQUIRED)
|
||||
|
||||
|
||||
target_compile_options(
|
||||
@ -231,6 +231,7 @@ target_link_libraries(
|
||||
antiword
|
||||
mobi
|
||||
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
|
||||
leptonica
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
|
37
third-party/libscan/libscan/arc/arc.c
vendored
37
third-party/libscan/libscan/arc/arc.c
vendored
@ -9,27 +9,13 @@
|
||||
|
||||
#define MAX_DECOMPRESSED_SIZE_RATIO 40.0
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
char tmp[PATH_MAX * 2];
|
||||
int should_parse_filtered_file(const char *filepath) {
|
||||
|
||||
if (ext == 0) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (strncmp(filepath + ext, "tgz", 3) == 0) {
|
||||
if (strstr(filepath, ".tgz")) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
memcpy(tmp, filepath, ext - 1);
|
||||
*(tmp + ext - 1) = '\0';
|
||||
|
||||
char *idx = strrchr(tmp, '.');
|
||||
|
||||
if (idx == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (strcmp(idx, ".tar") == 0) {
|
||||
if (strstr(filepath, ".tar.")) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -206,18 +192,10 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
struct stat entry_stat = *archive_entry_stat(entry);
|
||||
sub_job->vfile.st_mode = entry_stat.st_mode;
|
||||
sub_job->vfile.st_size = entry_stat.st_size;
|
||||
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
|
||||
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
|
||||
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
|
||||
decompressed_size_ratio)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (S_ISREG(sub_job->vfile.st_mode)) {
|
||||
if (S_ISREG(entry_stat.st_mode)) {
|
||||
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
|
||||
@ -231,6 +209,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
}
|
||||
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
|
||||
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
|
||||
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
|
||||
decompressed_size_ratio)
|
||||
break;
|
||||
}
|
||||
|
||||
// Handle excludes
|
||||
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
|
||||
|
2
third-party/libscan/libscan/arc/arc.h
vendored
2
third-party/libscan/libscan/arc/arc.h
vendored
@ -67,7 +67,7 @@ static int vfile_close_callback(struct archive *a, void *user_data) {
|
||||
|
||||
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext);
|
||||
int should_parse_filtered_file(const char *filepath);
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
|
||||
|
||||
|
2
third-party/libscan/libscan/ebook/ebook.c
vendored
2
third-party/libscan/libscan/ebook/ebook.c
vendored
@ -162,7 +162,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
|
||||
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
free(samples);
|
||||
av_packet_unref(&jpeg_packet);
|
||||
|
2
third-party/libscan/libscan/font/font.c
vendored
2
third-party/libscan/libscan/font/font.c
vendored
@ -232,7 +232,7 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
bmp_format(&bmp_data, dimensions, bitmap);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) bmp_data.buf, bmp_data.cur);
|
||||
ctx->store(doc->doc_id, 0, bmp_data.buf, bmp_data.cur);
|
||||
|
||||
dyn_buffer_destroy(&bmp_data);
|
||||
free(bitmap);
|
||||
|
16
third-party/libscan/libscan/media/media.c
vendored
16
third-party/libscan/libscan/media/media.c
vendored
@ -468,8 +468,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
if (scaled_frame == STORE_AS_IS) {
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
|
||||
frame_and_packet->packet->size);
|
||||
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
|
||||
} else {
|
||||
// Encode frame to jpeg
|
||||
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
|
||||
@ -482,19 +481,17 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
|
||||
// Save thumbnail
|
||||
if (thumbnail_index == 0) {
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
|
||||
} else if (thumbnail_index > 1) {
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
// TO FIX: the 2nd rendered frame is always broken, just skip it until
|
||||
// I figure out a better fix.
|
||||
thumbnail_index -= 1;
|
||||
|
||||
char tn_key[sizeof(doc->doc_id) + sizeof(char) * 4];
|
||||
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc->doc_id, thumbnail_index);
|
||||
ctx->store(doc->doc_id, thumbnail_index, jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
ctx->store((char *) tn_key, sizeof(tn_key), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
} else {
|
||||
return_value = SAVE_THUMBNAIL_SKIPPED;
|
||||
}
|
||||
@ -854,8 +851,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
|
||||
if (scaled_frame == STORE_AS_IS) {
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
|
||||
frame_and_packet->packet->size);
|
||||
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
|
||||
} else {
|
||||
// Encode frame to jpeg
|
||||
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
|
||||
@ -868,7 +864,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
|
||||
// Save thumbnail
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
av_packet_unref(&jpeg_packet);
|
||||
avcodec_free_context(&jpeg_encoder);
|
||||
|
2
third-party/libscan/libscan/ooxml/ooxml.c
vendored
2
third-party/libscan/libscan/ooxml/ooxml.c
vendored
@ -191,7 +191,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
|
||||
archive_read_data(a, buf, entry_size);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), buf, entry_size);
|
||||
ctx->store(doc->doc_id, 1, buf, entry_size);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
|
10
third-party/libscan/libscan/scan.h
vendored
10
third-party/libscan/libscan/scan.h
vendored
@ -6,6 +6,7 @@
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <openssl/md5.h>
|
||||
#include <openssl/sha.h>
|
||||
@ -16,7 +17,7 @@
|
||||
|
||||
#define UNUSED(x) __attribute__((__unused__)) x
|
||||
|
||||
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
|
||||
typedef void (*store_callback_t)(char *key, int num, void *buf, size_t buf_len);
|
||||
|
||||
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
|
||||
|
||||
@ -111,8 +112,8 @@ typedef struct document {
|
||||
unsigned long size;
|
||||
unsigned int mime;
|
||||
int mtime;
|
||||
short base;
|
||||
short ext;
|
||||
int base;
|
||||
int ext;
|
||||
meta_line_t *meta_head;
|
||||
meta_line_t *meta_tail;
|
||||
char filepath[PATH_MAX * 2 + 1];
|
||||
@ -144,7 +145,6 @@ typedef struct vfile {
|
||||
|
||||
int mtime;
|
||||
size_t st_size;
|
||||
unsigned int st_mode;
|
||||
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
@ -161,7 +161,7 @@ typedef struct vfile {
|
||||
logf_callback_t logf;
|
||||
} vfile_t;
|
||||
|
||||
typedef struct parse_job_t {
|
||||
typedef struct {
|
||||
int base;
|
||||
int ext;
|
||||
struct vfile vfile;
|
||||
|
33
third-party/libscan/libscan/util.h
vendored
33
third-party/libscan/libscan/util.h
vendored
@ -358,4 +358,37 @@ static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
static parse_job_t *create_parse_job(const char *filepath, int mtime, size_t st_size) {
|
||||
parse_job_t *job = (parse_job_t *) malloc(sizeof(parse_job_t));
|
||||
|
||||
job->parent[0] = '\0';
|
||||
|
||||
strcpy(job->filepath, filepath);
|
||||
strcpy(job->vfile.filepath, filepath);
|
||||
job->vfile.st_size = st_size;
|
||||
job->vfile.mtime = mtime;
|
||||
|
||||
const char *slash = strrchr(filepath, '/');
|
||||
if (slash == NULL) {
|
||||
job->base = 0;
|
||||
} else {
|
||||
job->base = (int) (slash - filepath + 1);
|
||||
}
|
||||
|
||||
const char *dot = strrchr(filepath + job->base, '.');
|
||||
if (dot == NULL) {
|
||||
job->ext = (int) strlen(filepath);
|
||||
} else {
|
||||
job->ext = (int) (dot - filepath + 1);
|
||||
}
|
||||
|
||||
job->vfile.fd = -1;
|
||||
job->vfile.is_fs_file = TRUE;
|
||||
job->vfile.has_checksum = FALSE;
|
||||
job->vfile.rewind_buffer_size = 0;
|
||||
job->vfile.rewind_buffer = NULL;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
1
third-party/libscan/test/test_util.cpp
vendored
1
third-party/libscan/test/test_util.cpp
vendored
@ -55,7 +55,6 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
|
||||
f->mtime = (int)info.st_mtim.tv_sec;
|
||||
f->st_size = info.st_size;
|
||||
f->st_mode = info.st_mode;
|
||||
|
||||
f->fd = open(filepath, O_RDONLY);
|
||||
|
||||
|
2
third-party/libscan/test/test_util.h
vendored
2
third-party/libscan/test/test_util.h
vendored
@ -21,7 +21,7 @@ static void noop_log(const char *filepath, int level, char *str) {
|
||||
|
||||
static size_t store_size = 0;
|
||||
|
||||
static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
|
||||
static void counter_store(char* key, int num, void *value, size_t value_len) {
|
||||
store_size += value_len;
|
||||
// char id[37];
|
||||
// char tmp[PATH_MAX];
|
||||
|
2
third-party/libscan/third-party/antiword
vendored
2
third-party/libscan/third-party/antiword
vendored
@ -1 +1 @@
|
||||
Subproject commit ddb042143e72a8b789e06f09dbc897dfa9f15b82
|
||||
Subproject commit badfdac84586511d4f2b626516162d62a3625349
|
Loading…
x
Reference in New Issue
Block a user