From c3b7a05dded71ecb5a9e3ea7808081d47e282826 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 26 Oct 2019 12:35:01 -0400 Subject: [PATCH] bugfixes & refactoring --- .gitignore | 2 + CMakeLists.txt | 20 +-- Diagram1.dia | Bin 2286 -> 0 bytes README.md | 14 +-- mime.csv | 6 +- scripts/before_build.sh | 2 + scripts/get_static_libs.sh | 9 -- src/cli.c | 155 +++++++++++++++++++++++ src/cli.h | 41 ++++++ src/ctx.h | 2 + src/io/store.c | 8 +- src/main.c | 238 +++++++++++++---------------------- src/parsing/media.c | 4 + src/parsing/mime_generated.c | 13 ++ src/parsing/parse.c | 16 +-- src/parsing/pdf.c | 78 +++++++++--- src/sist.h | 1 + src/tpool.c | 26 ++-- src/util.c | 27 +++- src/util.h | 1 + src/web/serve.c | 2 +- 21 files changed, 440 insertions(+), 225 deletions(-) delete mode 100644 Diagram1.dia create mode 100644 src/cli.c create mode 100644 src/cli.h diff --git a/.gitignore b/.gitignore index d897785..53a5aae 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ sist2* index.sist2/ bundle.css bundle.js +*.a +vgcore.* diff --git a/CMakeLists.txt b/CMakeLists.txt index bfcf0e1..5c33f1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ add_executable( # LMDB lmdb/libraries/liblmdb/lmdb.h lmdb/libraries/liblmdb/mdb.c lmdb/libraries/liblmdb/midl.h lmdb/libraries/liblmdb/midl.c -) + src/cli.c src/cli.h) find_package(PkgConfig REQUIRED) set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:/usr/local/lib/pkgconfig/") @@ -84,8 +84,8 @@ include_directories( target_compile_options(sist2 PRIVATE - -Ofast -# -march=native + -O3 + -march=native -fno-stack-protector -fomit-frame-pointer ) @@ -93,16 +93,18 @@ target_compile_options(sist2 TARGET_LINK_LIBRARIES( sist2 + ${GLIB_LIBRARIES} ${GOBJECT_LIBRARIES} ${UUID_LIBRARIES} - ${GLIB_LIBRARIES} # ffmpeg - ${PROJECT_SOURCE_DIR}/lib/libavcodec.a - ${PROJECT_SOURCE_DIR}/lib/libavformat.a - ${PROJECT_SOURCE_DIR}/lib/libavutil.a - ${PROJECT_SOURCE_DIR}/lib/libswscale.a - ${PROJECT_SOURCE_DIR}/lib/libswresample.a +# ${PROJECT_SOURCE_DIR}/lib/libavcodec.a +# ${PROJECT_SOURCE_DIR}/lib/libavformat.a +# ${PROJECT_SOURCE_DIR}/lib/libavutil.a +# ${PROJECT_SOURCE_DIR}/lib/libswscale.a +# ${PROJECT_SOURCE_DIR}/lib/libswresample.a + ${FFMPEG_LIBRARIES} + swscale # mupdf ${PROJECT_SOURCE_DIR}/lib/libmupdf.a diff --git a/Diagram1.dia b/Diagram1.dia deleted file mode 100644 index 32a3be89869c7b195a14d943022fdf5603186770..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2286 zcmVUODGi5Jt#!R= zmSp+hIY{n?w~zKS9IcxfZMAct?PV~FCV3QmYi=6qk5X-#UPk$Dvx{X`#JWoPy4_qTtcx78!9FKpVXd(cLZauMXSINit5W0O<^fq}#r>%m}3Dk!Kx zZ)0*k+~l%w$z|d4%fiLQ!!pauJc!GE$VrwZQIJ-sW&RYk$0?>kqNmuQZguFzV_cTm zk>`I5lH!yORF}W6wsTkRERVy(3wNtVLo2H|Eawj|j)tph{nb(H&v6k?lBn+K0$O1hf#54nr+u(D*ta81Fq6qHO0GK+q_ZU2 zAkhj*Fp0*D01A~Zb6akQ93|1>VVdRXzG&#XlNQR_nEj5v*%#e4PSv}Q=zukk;@P}x z217u>txwC274vNMP@^c1)8#oR%ODK5sdA?~5U_SFm(k1KDD9e!ojOe;VFlw>y&Y|- z2|sO6mj_I86TfF^SqEmW4<3U>oV@C13esXYD9Tr3JM^-zZv8z{P%8fUIA@h>ssN`sxU`f1um}hzXZ#|=eJ(k}53OdxAP==H`$JpL8YdrjR%2EZVWX!7ZAEmI}yQr`Ff7&1yStL?%Qt$bG?MtWL0L?e;I6k ztH+D)O@*49(fVfCeDj-u+q2LG*A zb$7hH-0L9sz8XJqp9N?SEWiY&Rrr*;C4n~}!VLvrXi8igfDaJMhX^9?SUZp9H%!JK z!LNpw^Q)ijBQ@K0auL(SI-4d6B!SpO=OuAznKn2W32JPU5?uSC5&efJFuLWj8`fdn^oU&#VEG# z6&F&Bst?7;m~qai;KZ$9B(MU)oLIdmOxVhlT&u;d9rR++i^bbnlzv`C6E7DpAs2~Y zCT__w^m6e+auLBE^dixwsCrRKrDMHFC|So8tzKk;n;F9}aW5C^d)drAq65o$wfF<7 zMdd^-o{))$I`4qu3SgzAjhajBR55BKx<(?Hj0FNnN&Bu7A@OlD@p>6SpNZ!)@q8wp z&%|rX1D*F?Xe%t0^{e65R_6X`Km*YV2x2@I5C}z>VF4oB#s;r~26`Ak&jOwWJPUXh zXv+em2Ns}On;R=eAeL@vfXE67Gb{}c5MmMzB;Z7_56uJJhtLy&Cjw6do(S4Semn2X zF>mv@s1B#m*2N_aFcbW-NzDk2jqOxanvxK0cRQaP`EBfFtx+c?RHbWD?DGVEtjMqG zl3I$Ky0 zD9ERCpAhF<&L|+I$s-~FLauy5oIk_wdSXYSkJyoBdb_8BDd{+N1fku%fiWgyBW091 z!UOB|MK2IMs_ka2#M0W4S5*Z%714D%Dt*xvG$vBJ%omA_86!muZT(R z*fv=t1{h_^8YhbFo-NY)@Z(EbC*rNsMXXcpboA!we`=npvi*CL_cvRAkn;ZKzrZFK I raw_documents.ndjson ``` +**Start web interface** ```bash -sist2 web [OPTION]... INDEX... - -# Examples sist2 web --bind 0.0.0.0 --port 4321 ./my_idx1 ./my_idx2 ./my_idx3 ``` diff --git a/mime.csv b/mime.csv index fce1a1b..f603c48 100644 --- a/mime.csv +++ b/mime.csv @@ -352,4 +352,8 @@ audio/mp4, m4b !image/vnd.djvu, djvu application/x-ms-reader, lit application/CDFV2-corrupt, -text/x-vcard, vcf \ No newline at end of file +text/x-vcard, vcf +application/x-innosetup, +application/winhelp, hlp +image/x-tga, +application/x-wine-extension-ini, \ No newline at end of file diff --git a/scripts/before_build.sh b/scripts/before_build.sh index 4ea706d..aba2601 100755 --- a/scripts/before_build.sh +++ b/scripts/before_build.sh @@ -1,5 +1,7 @@ #!/bin/bash +rm -rf index.sist2/ + rm web/js/bundle.js 2> /dev/null cat `ls -v web/js/*.min.js` > web/js/bundle.js cat web/js/{util,dom,search}.js >> web/js/bundle.js diff --git a/scripts/get_static_libs.sh b/scripts/get_static_libs.sh index 8794301..3494bf8 100755 --- a/scripts/get_static_libs.sh +++ b/scripts/get_static_libs.sh @@ -8,15 +8,6 @@ cd .. mv mupdf/build/release/libmupdf.a . mv mupdf/build/release/libmupdf-third.a . -# libpcre -cd libpcre -./autogen.sh -./configure --disable-shared -make -j 4 -cd .. - -mv libpcre/.libs/libpcre.a . - # ffmpeg cd ffmpeg ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay \ diff --git a/src/cli.c b/src/cli.c new file mode 100644 index 0000000..06798d7 --- /dev/null +++ b/src/cli.c @@ -0,0 +1,155 @@ +#include "cli.h" + +#define DEFAULT_OUTPUT "index.sist2/" +#define DEFAULT_CONTENT_SIZE 4096 +#define DEFAULT_QUALITY 15 +#define DEFAULT_SIZE 200 +#define DEFAULT_REWRITE_URL "" + +#define DEFAULT_ES_URL "http://localhost:9200" + +#define DEFAULT_BIND_ADDR "localhost" +#define DEFAULT_PORT "4090" + + +scan_args_t *scan_args_create() { + scan_args_t *args = calloc(sizeof(scan_args_t), 1); + return args; +} + +index_args_t *index_args_create() { + index_args_t *args = calloc(sizeof(index_args_t), 1); + return args; +} + +web_args_t *web_args_create() { + web_args_t *args = calloc(sizeof(web_args_t), 1); + return args; +} + +int scan_args_validate(scan_args_t *args, int argc, const char **argv) { + if (argc < 2) { + fprintf(stderr, "Required positional argument: PATH.\n"); + return 1; + } + + char *abs_path = abspath(argv[1]); + if (abs_path == NULL) { + fprintf(stderr, "File not found: %s", argv[1]); + return 1; + } else { + args->path = abs_path; + } + + if (args->incremental != NULL) { + abs_path = abspath(args->incremental); + if (abs_path == NULL) { + fprintf(stderr, "File not found: %s", args->incremental); + return 1; + } + } + + if (args->quality == 0) { + args->quality = DEFAULT_QUALITY; + } else if (args->quality < 1 || args->quality > 31) { + fprintf(stderr, "Invalid quality: %f\n", args->quality); + return 1; + } + + if (args->size == 0) { + args->size = DEFAULT_SIZE; + } else if (args->size <= 0) { + fprintf(stderr, "Invalid size: %d\n", args->size); + return 1; + } + + if (args->content_size == 0) { + args->content_size = DEFAULT_CONTENT_SIZE; + } else if (args->content_size <= 0) { + fprintf(stderr, "Invalid content-size: %d\n", args->content_size); + return 1; + } + + if (args->threads == 0) { + args->threads = 1; + } else if (args->threads < 0) { + fprintf(stderr, "Invalid threads: %d\n", args->threads); + return 1; + } + + if (args->output == NULL) { + args->output = malloc(strlen(DEFAULT_OUTPUT) + 1); + strcpy(args->output, DEFAULT_OUTPUT); + } else { + args->output = expandpath(args->output); + } + + int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR); + if (ret != 0) { + fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno)); + return 1; + } + + if (args->name == NULL) { + args->name = g_path_get_basename(args->output); + } + + if (args->rewrite_url == NULL) { + args->rewrite_url = DEFAULT_REWRITE_URL; + } + return 0; +} + +int index_args_validate(index_args_t *args, int argc, const char **argv) { + + if (argc < 2) { + fprintf(stderr, "Required positional argument: PATH.\n"); + return 1; + } + + char *index_path = abspath(argv[1]); + if (index_path == NULL) { + fprintf(stderr, "File not found: %s", argv[1]); + return 1; + } else { + args->index_path = argv[1]; + } + + if (args->es_url == NULL) { + args->es_url = DEFAULT_ES_URL; + } + return 0; +} + +int web_args_validate(web_args_t *args, int argc, const char **argv) { + + if (argc < 2) { + fprintf(stderr, "Required positional argument: PATH.\n"); + return 1; + } + + if (args->es_url == NULL) { + args->es_url = DEFAULT_ES_URL; + } + + if (args->bind == NULL) { + args->bind = DEFAULT_BIND_ADDR; + } + + if (args->port == NULL) { + args->port = DEFAULT_PORT; + } + + args->index_count = argc - 1; + args->indices = argv + 1; + + for (int i = 0; i < args->index_count; i++) { + char *abs_path = abspath(args->indices[i]); + if (abs_path == NULL) { + fprintf(stderr, "File not found: %s", abs_path); + return 1; + } + } + return 0; +} + diff --git a/src/cli.h b/src/cli.h new file mode 100644 index 0000000..bb1514f --- /dev/null +++ b/src/cli.h @@ -0,0 +1,41 @@ +#ifndef SIST2_CLI_H +#define SIST2_CLI_H + +#include "sist.h" + +typedef struct scan_args { + float quality; + int size; + int content_size; + int threads; + char *incremental; + char *output; + char *rewrite_url; + char *name; + char *path; +} scan_args_t; + +typedef struct index_args { + char *es_url; + const char *index_path; + int print; + int force_reset; +} index_args_t; + +typedef struct web_args { + char *es_url; + char *bind; + char *port; + int index_count; + const char **indices; +} web_args_t; + +scan_args_t *scan_args_create(); +index_args_t *index_args_create(); +web_args_t *web_args_create(); + +int scan_args_validate(scan_args_t *args, int argc, const char **argv); +int index_args_validate(index_args_t *args, int argc, const char **argv); +int web_args_validate(web_args_t *args, int argc, const char **argv); + +#endif diff --git a/src/ctx.h b/src/ctx.h index 87d8f9d..a29fc8a 100644 --- a/src/ctx.h +++ b/src/ctx.h @@ -21,6 +21,8 @@ struct { GHashTable *original_table; GHashTable *copy_table; + + pthread_mutex_t mupdf_mu; } ScanCtx; diff --git a/src/io/store.c b/src/io/store.c index ce143d6..c1179ef 100644 --- a/src/io/store.c +++ b/src/io/store.c @@ -20,13 +20,13 @@ store_t *store_create(char *path) { } store->size = (size_t) 1024 * 1024 * 5; - ScanCtx.stat_tn_size = store->size; + ScanCtx.stat_tn_size = 0; mdb_env_set_mapsize(store->env, store->size); // Open dbi MDB_txn *txn; - int r3 = mdb_txn_begin(store->env, NULL, 0, &txn); - int r4 = mdb_dbi_open(txn, NULL, 0, &store->dbi); + mdb_txn_begin(store->env, NULL, 0, &txn); + mdb_dbi_open(txn, NULL, 0, &store->dbi); mdb_txn_commit(txn); return store; @@ -55,6 +55,7 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu mdb_txn_begin(store->env, NULL, 0, &txn); int put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0); + ScanCtx.stat_tn_size += buf_len; if (put_ret == MDB_MAP_FULL) { mdb_txn_abort(txn); @@ -67,7 +68,6 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu mdb_env_set_mapsize(store->env, store->size); mdb_txn_begin(store->env, NULL, 0, &txn); put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0); - ScanCtx.stat_tn_size = store->size; } mdb_txn_commit(txn); diff --git a/src/main.c b/src/main.c index 7dd4358..e374da8 100644 --- a/src/main.c +++ b/src/main.c @@ -1,8 +1,11 @@ #include "sist.h" #include "ctx.h" -static const char *const Version = "1.0.0"; +#define DESCRIPTION "Lightning-fast file system indexer and search tool." +#define EPILOG "Made by simon987 . Released under GPL-3.0" + +static const char *const Version = "1.0.0"; static const char *const usage[] = { "sist2 scan [OPTION]... PATH", "sist2 index [OPTION]... INDEX", @@ -10,6 +13,11 @@ static const char *const usage[] = { NULL, }; +void global_init() { + curl_global_init(CURL_GLOBAL_NOTHING); + av_log_set_level(AV_LOG_QUIET); +} + void init_dir(const char *dirpath) { char path[PATH_MAX]; snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath); @@ -32,11 +40,16 @@ void scan_print_header() { printf("output\t\t%s\n", ScanCtx.index.path); } -void sist2_scan(const char *path, const char *incremental_from) { +void sist2_scan(scan_args_t *args) { - av_log_set_level(AV_LOG_QUIET); - - strcpy(ScanCtx.index.desc.root, abspath(path)); + ScanCtx.tn_qscale = args->quality; + ScanCtx.tn_size = args->size; + ScanCtx.content_size = args->content_size; + ScanCtx.pool = tpool_create(args->threads, serializer_cleanup); + ScanCtx.threads = args->threads; + strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path)); + strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name)); + strcpy(ScanCtx.index.desc.root, args->path); ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root); init_dir(ScanCtx.index.path); @@ -51,12 +64,11 @@ void sist2_scan(const char *path, const char *incremental_from) { scan_print_header(); - if (incremental_from != NULL) { - incremental_from = abspath(incremental_from); + if (args->incremental != NULL) { ScanCtx.original_table = incremental_get_table(); ScanCtx.copy_table = incremental_get_table(); - DIR *dir = opendir(incremental_from); + DIR *dir = opendir(args->incremental); if (dir == NULL) { perror("opendir"); return; @@ -65,7 +77,7 @@ void sist2_scan(const char *path, const char *incremental_from) { while ((de = readdir(dir)) != NULL) { if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) { char file_path[PATH_MAX]; - snprintf(file_path, PATH_MAX, "%s/%s", incremental_from, de->d_name); + snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name); incremental_read(ScanCtx.original_table, file_path); } } @@ -76,14 +88,15 @@ void sist2_scan(const char *path, const char *incremental_from) { walk_directory_tree(ScanCtx.index.desc.root); tpool_wait(ScanCtx.pool); + tpool_destroy(ScanCtx.pool); - if (incremental_from != NULL) { + if (args->incremental != NULL) { char dst_path[PATH_MAX]; - snprintf(store_path, PATH_MAX, "%sthumbs", incremental_from); + snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental); snprintf(dst_path, PATH_MAX, "%s_index_original", ScanCtx.index.path); store_t *source = store_create(store_path); - DIR *dir = opendir(incremental_from); + DIR *dir = opendir(args->incremental); if (dir == NULL) { perror("opendir"); return; @@ -92,7 +105,7 @@ void sist2_scan(const char *path, const char *incremental_from) { while ((de = readdir(dir)) != NULL) { if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) { char file_path[PATH_MAX]; - snprintf(file_path, PATH_MAX, "%s/%s", incremental_from, de->d_name); + snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name); incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table); } } @@ -101,16 +114,18 @@ void sist2_scan(const char *path, const char *incremental_from) { } store_destroy(ScanCtx.index.store); - tpool_destroy(ScanCtx.pool); } -void sist2_index(const char *path, int print_index, int arg_force_reset) { - if (!print_index) { - elastic_init(arg_force_reset); +void sist2_index(index_args_t *args) { + + IndexCtx.es_url = args->es_url; + + if (!args->print) { + elastic_init(args->force_reset); } - char *index_path = abspath(path); + char descriptor_path[PATH_MAX]; - snprintf(descriptor_path, PATH_MAX, "%s/descriptor.json", index_path); + snprintf(descriptor_path, PATH_MAX, "%s/descriptor.json", args->index_path); index_descriptor_t desc = read_index_descriptor(descriptor_path); if (strcmp(desc.version, Version) != 0) { @@ -118,14 +133,14 @@ void sist2_index(const char *path, int print_index, int arg_force_reset) { return; } - DIR *dir = opendir(index_path); + DIR *dir = opendir(args->index_path); if (dir == NULL) { perror("opendir"); return; } index_func f; - if (print_index) { + if (args->print) { f = print_json; } else { f = index_json; @@ -135,22 +150,27 @@ void sist2_index(const char *path, int print_index, int arg_force_reset) { while ((de = readdir(dir)) != NULL) { if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) { char file_path[PATH_MAX]; - snprintf(file_path, PATH_MAX, "%s/%s", index_path, de->d_name); + snprintf(file_path, PATH_MAX, "%s/%s", args->index_path, de->d_name); read_index(file_path, desc.uuid, f); } } - if (!print_index) { + if (!args->print) { elastic_flush(); destroy_indexer(); } } -void sist2_web(const char *indices[], int index_count, const char *host, const char *port) { +void sist2_web(web_args_t *args) { - for (int i = 0; i < index_count; i++) { - char *abs_path = abspath(indices[i]); + WebCtx.es_url = args->es_url; + WebCtx.index_count = args->index_count; + for (int i = 0; i < args->index_count; i++) { + char *abs_path = abspath(args->indices[i]); + if (abs_path == NULL) { + return; + } char path_tmp[PATH_MAX]; snprintf(path_tmp, PATH_MAX, "%sthumbs", abs_path); @@ -161,168 +181,88 @@ void sist2_web(const char *indices[], int index_count, const char *host, const c strcpy(WebCtx.indices[i].path, abs_path); printf("Loaded index: %s\n", WebCtx.indices[i].desc.name); + free(abs_path); } - WebCtx.index_count = index_count; - - serve(host, port); + serve(args->bind, args->port); } + int main(int argc, const char *argv[]) { - curl_global_init(CURL_GLOBAL_NOTHING); + global_init(); - float arg_quality = 0; - int arg_size = 0; - int arg_content_size = 0; - int arg_threads = 0; - char *arg_incremental = NULL; - char *arg_output = NULL; - char *arg_rewrite_url = NULL; - char *arg_name = NULL; + scan_args_t *scan_args = scan_args_create(); + index_args_t *index_args = index_args_create(); + web_args_t *web_args = web_args_create(); - char *arg_es_url = NULL; - int arg_print_index = 0; - int arg_force_reset = 0; - - char *arg_web_host = NULL; - char *arg_web_port = NULL; + char * common_es_url; struct argparse_option options[] = { OPT_HELP(), OPT_GROUP("Scan options"), - OPT_INTEGER('t', "threads", &arg_threads, "Number of threads. DEFAULT=1"), - OPT_FLOAT('q', "quality", &arg_quality, + OPT_INTEGER('t', "threads", &scan_args->threads, "Number of threads. DEFAULT=1"), + OPT_FLOAT('q', "quality", &scan_args->quality, "Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=15"), - OPT_INTEGER(0, "size", &arg_size, "Thumbnail size, in pixels. DEFAULT=200"), - OPT_INTEGER(0, "content-size", &arg_content_size, + OPT_INTEGER(0, "size", &scan_args->size, "Thumbnail size, in pixels. DEFAULT=200"), + OPT_INTEGER(0, "content-size", &scan_args->content_size, "Number of bytes to be extracted from text documents. DEFAULT=4096"), - OPT_STRING(0, "incremental", &arg_incremental, "Reuse an existing index and only scan modified files."), - OPT_STRING('o', "output", &arg_output, "Output directory. DEFAULT=index.sist2/"), - OPT_STRING(0, "rewrite-url", &arg_rewrite_url, "Serve files from this url instead of from disk."), - OPT_STRING(0, "name", &arg_name, "Index display name. DEFAULT: (name of the directory)"), + OPT_STRING(0, "incremental", &scan_args->incremental, + "Reuse an existing index and only scan modified files."), + OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"), + OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."), + OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: (name of the directory)"), OPT_GROUP("Index options"), - OPT_STRING(0, "es-url", &arg_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"), - OPT_BOOLEAN('p', "print", &arg_print_index, "Just print JSON documents to stdout."), - OPT_BOOLEAN('f', "force-reset", &arg_force_reset, "Reset Elasticsearch mappings and settings. " + OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"), + OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."), + OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. " "(You must use this option the first time you use the index command)"), OPT_GROUP("Web options"), - OPT_STRING(0, "es-url", &arg_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"), - OPT_STRING(0, "bind", &arg_web_host, "Listen on this address. DEFAULT=localhost"), - OPT_STRING(0, "port", &arg_web_port, "Listen on this port. DEFAULT=4090"), + OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"), + OPT_STRING(0, "bind", &web_args->bind, "Listen on this address. DEFAULT=localhost"), + OPT_STRING(0, "port", &web_args->port, "Listen on this port. DEFAULT=4090"), OPT_END(), }; struct argparse argparse; argparse_init(&argparse, options, usage, 0); - argparse_describe( - &argparse, - "\nLightning-fast file system indexer and search tool.", - "\nMade by simon987 . Released under GPL-3.0" - ); - + argparse_describe(&argparse, DESCRIPTION, EPILOG); argc = argparse_parse(&argparse, argc, argv); - //Set defaults - if (arg_quality == 0) { - arg_quality = 15; - } else if (arg_quality < 1 || arg_quality > 31) { - fprintf(stderr, "Invalid quality: %f\n", arg_quality); - return 1; - } + web_args->es_url = common_es_url; + index_args->es_url = common_es_url; - if (arg_size == 0) { - arg_size = 200; - } else if (arg_size <= 0) { - fprintf(stderr, "Invalid size: %d\n", arg_size); - return 1; - } - - if (arg_content_size == 0) { - arg_content_size = 4096; - } else if (arg_content_size <= 0) { - fprintf(stderr, "Invalid content-size: %d\n", arg_content_size); - return 1; - } - - if (arg_threads == 0) { - arg_threads = 1; - } else if (arg_threads < 0) { - fprintf(stderr, "Invalid threads: %d\n", arg_threads); - return 1; - } - - if (arg_output == NULL) { - arg_output = "index.sist2/"; - } - - if (arg_es_url == NULL) { - arg_es_url = "http://localhost:9200"; - } - - if (arg_web_host == NULL) { - arg_web_host = "localhost"; - } - - if (arg_web_port == NULL) { - arg_web_port = "4090"; - } - - // Commands if (argc == 0) { argparse_usage(&argparse); + return 1; } else if (strcmp(argv[0], "scan") == 0) { - if (argc < 2) { - fprintf(stderr, "Required positional argument: PATH.\n"); - argparse_usage(&argparse); - return 1; - } - if (arg_name == NULL) { - arg_name = g_path_get_basename(argv[1]); + int err = scan_args_validate(scan_args, argc, argv); + if (err != 0) { + return err; } + sist2_scan(scan_args); - int ret = mkdir(arg_output, S_IRUSR | S_IWUSR | S_IXUSR); - if (ret != 0) { - fprintf(stderr, "Invalid output: '%s' (%s).\n", arg_output, strerror(errno)); - return 1; - } - - ScanCtx.tn_qscale = arg_quality; - ScanCtx.tn_size = arg_size; - ScanCtx.content_size = arg_content_size; - ScanCtx.pool = tpool_create(arg_threads, serializer_cleanup); - ScanCtx.threads = arg_threads; - strncpy(ScanCtx.index.path, arg_output, sizeof(ScanCtx.index.path)); - strncpy(ScanCtx.index.desc.name, arg_name, sizeof(ScanCtx.index.desc.name)); - if (arg_rewrite_url == NULL) { - strcpy(ScanCtx.index.desc.rewrite_url, ""); - } else { - strcpy(ScanCtx.index.desc.rewrite_url, arg_rewrite_url); - } - sist2_scan(argv[1], arg_incremental); } else if (strcmp(argv[0], "index") == 0) { - if (argc < 2) { - fprintf(stderr, "Required positional argument: PATH.\n"); - argparse_usage(&argparse); - return 1; - } - IndexCtx.es_url = arg_es_url; - sist2_index(argv[1], arg_print_index, arg_force_reset); + int err = index_args_validate(index_args, argc, argv); + if (err != 0) { + return err; + } + sist2_index(index_args); + } else if (strcmp(argv[0], "web") == 0) { - if (argc < 2) { - fprintf(stderr, "Required positional argument: PATH.\n"); - argparse_usage(&argparse); - return 1; - } - WebCtx.es_url = arg_es_url; - sist2_web(argv + 1, argc - 1, arg_web_host, arg_web_port); + int err = web_args_validate(web_args, argc, argv); + if (err != 0) { + return err; + } + sist2_web(web_args); + } else { fprintf(stderr, "Invalid command: '%s'\n", argv[0]); argparse_usage(&argparse); diff --git a/src/parsing/media.c b/src/parsing/media.c index 91b53f8..ff83eb2 100644 --- a/src/parsing/media.c +++ b/src/parsing/media.c @@ -153,6 +153,10 @@ void parse_media(const char *filepath, document_t *doc) { int video_stream = -1; AVFormatContext *pFormatCtx = avformat_alloc_context(); + if (pFormatCtx == NULL) { + fprintf(stderr, "Could not allocate AVFormatContext! %s \n", filepath); + return; + } int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL); if (res < 0) { printf("ERR%s %s\n", filepath, av_err2str(res)); diff --git a/src/parsing/mime_generated.c b/src/parsing/mime_generated.c index 692e9f2..9e1ed29 100644 --- a/src/parsing/mime_generated.c +++ b/src/parsing/mime_generated.c @@ -360,6 +360,10 @@ enum mime { application_x_ms_reader=655712, application_CDFV2_corrupt=655713, text_x_vcard=590178, + application_x_innosetup=655715, + application_winhelp=655716, + image_x_tga=524645, + application_x_wine_extension_ini=655718, }; char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) { case application_arj: return "application/arj"; @@ -716,6 +720,10 @@ case image_vnd_djvu: return "image/vnd.djvu"; case application_x_ms_reader: return "application/x-ms-reader"; case application_CDFV2_corrupt: return "application/CDFV2-corrupt"; case text_x_vcard: return "text/x-vcard"; +case application_x_innosetup: return "application/x-innosetup"; +case application_winhelp: return "application/winhelp"; +case image_x_tga: return "image/x-tga"; +case application_x_wine_extension_ini: return "application/x-wine-extension-ini"; default: return NULL;}} GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal); g_hash_table_insert(ext_table, "arj", (gpointer)application_arj); @@ -1192,6 +1200,7 @@ g_hash_table_insert(ext_table, "m4b", (gpointer)audio_mp4); g_hash_table_insert(ext_table, "djvu", (gpointer)image_vnd_djvu); g_hash_table_insert(ext_table, "lit", (gpointer)application_x_ms_reader); g_hash_table_insert(ext_table, "vcf", (gpointer)text_x_vcard); +g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp); return ext_table;} GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal); g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj); @@ -1548,5 +1557,9 @@ g_hash_table_insert(mime_table, "image/vnd.djvu", (gpointer)image_vnd_djvu); g_hash_table_insert(mime_table, "application/x-ms-reader", (gpointer)application_x_ms_reader); g_hash_table_insert(mime_table, "application/CDFV2-corrupt", (gpointer)application_CDFV2_corrupt); g_hash_table_insert(mime_table, "text/x-vcard", (gpointer)text_x_vcard); +g_hash_table_insert(mime_table, "application/x-innosetup", (gpointer)application_x_innosetup); +g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp); +g_hash_table_insert(mime_table, "image/x-tga", (gpointer)image_x_tga); +g_hash_table_insert(mime_table, "application/x-wine-extension-ini", (gpointer)application_x_wine_extension_ini); return mime_table;} #endif diff --git a/src/parsing/parse.c b/src/parsing/parse.c index dbf4e2b..7bb59f0 100644 --- a/src/parsing/parse.c +++ b/src/parsing/parse.c @@ -94,7 +94,7 @@ void parse(void *arg) { if (!(SHOULD_PARSE(doc.mime))) { } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) { - parse_media(job->filepath, &doc); +// parse_media(job->filepath, &doc); } else if (IS_PDF(doc.mime)) { void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd); @@ -105,15 +105,15 @@ void parse(void *arg) { } } else if (mmime == MimeText && ScanCtx.content_size > 0) { - parse_text(bytes_read, &fd, (char *) buf, &doc); +// parse_text(bytes_read, &fd, (char *) buf, &doc); } else if (IS_FONT(doc.mime)) { - void *font_buf = read_all(job, (char *) buf, bytes_read, &fd); - parse_font(font_buf, doc.size, &doc); - - if (font_buf != buf) { - free(font_buf); - } +// void *font_buf = read_all(job, (char *) buf, bytes_read, &fd); +// parse_font(font_buf, doc.size, &doc); +// +// if (font_buf != buf) { +// free(font_buf); +// } } write_document(&doc); diff --git a/src/parsing/pdf.c b/src/parsing/pdf.c index 43c4482..5f4a058 100644 --- a/src/parsing/pdf.c +++ b/src/parsing/pdf.c @@ -1,13 +1,13 @@ #include "pdf.h" #include "src/ctx.h" +__always_inline fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { fz_page *cover = fz_load_page(ctx, fzdoc, 0); fz_rect bounds = fz_bound_page(ctx, cover); float scale; - unsigned char *tn_buf; float w = (float) bounds.x1 - bounds.x0; float h = (float) bounds.y1 - bounds.y0; if (w > h) { @@ -17,11 +17,21 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { } fz_matrix m = fz_scale(scale, scale); - fz_pixmap *pixmap; - fz_colorspace *color_space = fz_device_rgb(ctx); - pixmap = fz_new_pixmap_from_page(ctx, cover, m, color_space, 0); + bounds = fz_transform_rect(bounds, m); + fz_irect bbox = fz_round_rect(bounds); + fz_pixmap *pixmap = fz_new_pixmap_with_bbox(ctx, ctx->colorspace->rgb, bbox, NULL, 0); + + fz_clear_pixmap_with_value(ctx, pixmap, 0xFF); + fz_device *dev = fz_new_draw_device(ctx, m, pixmap); + + pthread_mutex_lock(&ScanCtx.mupdf_mu); + fz_run_page(ctx, cover, dev, fz_identity, NULL); + pthread_mutex_unlock(&ScanCtx.mupdf_mu); + + fz_drop_device(ctx, dev); fz_buffer *fzbuf = fz_new_buffer_from_pixmap_as_png(ctx, pixmap, fz_default_color_params); + unsigned char *tn_buf; size_t tn_len = fz_buffer_storage(ctx, fzbuf, &tn_buf); store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len); @@ -32,29 +42,38 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { return cover; } - -void fz_noop_callback(void *user, const char *message) { -} +void fz_noop_callback(void *user, const char *message) {} void parse_pdf(void *buf, size_t buf_len, document_t *doc) { - //TODO error handling + static int mu_is_initialized = 0; + if (!mu_is_initialized) { + pthread_mutex_init(&ScanCtx.mupdf_mu, NULL); + mu_is_initialized = 1; + } fz_context *ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); + fz_stream *stream = NULL; + fz_document *fzdoc = NULL; + + fz_var(stream); + fz_var(fzdoc); + fz_try(ctx) { + fz_disable_icc(ctx); fz_register_document_handlers(ctx); - ctx->warn.print = fz_noop_callback; //disable warnings + //disable warnings + ctx->warn.print = fz_noop_callback; ctx->error.print = fz_noop_callback; - fz_stream *stream = fz_open_memory(ctx, buf, buf_len); - fz_document *fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream); + stream = fz_open_memory(ctx, buf, buf_len); + fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream); int page_count = fz_count_pages(ctx, fzdoc); fz_page *cover = render_cover(ctx, doc, fzdoc); fz_stext_options opts; - fz_parse_stext_options(ctx, &opts, "preserve-ligatures"); text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size); @@ -65,12 +84,34 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) { } else { page = fz_load_page(ctx, fzdoc, current_page); } - fz_stext_page *stext = fz_new_stext_page_from_page(ctx, page, &opts); + + fz_stext_page *stext; + fz_device *dev = NULL; + + fz_var(dev); + + stext = fz_new_stext_page(ctx, fz_bound_page(ctx, page)); + fz_try(ctx) + { + dev = fz_new_stext_device(ctx, stext, &opts); + pthread_mutex_lock(&ScanCtx.mupdf_mu); + fz_run_page_contents(ctx, page, dev, fz_identity, NULL); + pthread_mutex_unlock(&ScanCtx.mupdf_mu); + fz_close_device(ctx, dev); + } + fz_always(ctx) + fz_drop_device(ctx, dev); + fz_catch(ctx) + { + fz_drop_stext_page(ctx, stext); + fz_rethrow(ctx); + } fz_stext_block *block = stext->first_block; while (block != NULL) { if (block->type != FZ_STEXT_BLOCK_TEXT) { + block = block->next; continue; } @@ -100,15 +141,12 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) { memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur); text_buffer_destroy(&text_buf); APPEND_META(doc, meta_content) - + } + fz_always(ctx) + { fz_drop_stream(ctx, stream); fz_drop_document(ctx, fzdoc); fz_drop_context(ctx); - - } - fz_catch(ctx) - { -// printf("err"); - } + } fz_catch(ctx) {} } diff --git a/src/sist.h b/src/sist.h index fff5eab..d42aa00 100644 --- a/src/sist.h +++ b/src/sist.h @@ -49,6 +49,7 @@ #include "parsing/font.h" #include "index/web.h" #include "web/serve.h" +#include "cli.h" ; diff --git a/src/tpool.c b/src/tpool.c index 7694199..8dea675 100644 --- a/src/tpool.c +++ b/src/tpool.c @@ -18,7 +18,8 @@ typedef struct tpool { pthread_cond_t has_work_cond; pthread_cond_t working_cond; - int working_cnt; + pthread_t *threads; + int thread_cnt; int work_cnt; int done_cnt; @@ -109,7 +110,6 @@ static void *tpool_worker(void *arg) { } tpool_work_t *work = tpool_work_get(pool); - pool->working_cnt++; pthread_mutex_unlock(&(pool->work_mutex)); if (work != NULL) { @@ -118,12 +118,11 @@ static void *tpool_worker(void *arg) { } pthread_mutex_lock(&(pool->work_mutex)); - pool->working_cnt--; pool->done_cnt++; progress_bar_print((double)pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size, ScanCtx.stat_index_size); - if (pool->working_cnt == 0 && pool->work_head == NULL) { + if (pool->work_head == NULL) { pthread_cond_signal(&(pool->working_cond)); } pthread_mutex_unlock(&(pool->work_mutex)); @@ -131,7 +130,6 @@ static void *tpool_worker(void *arg) { pool->cleanup_func(); - pool->thread_cnt--; pthread_cond_signal(&(pool->working_cond)); pthread_mutex_unlock(&(pool->work_mutex)); return NULL; @@ -140,13 +138,13 @@ static void *tpool_worker(void *arg) { void tpool_wait(tpool_t *pool) { pthread_mutex_lock(&(pool->work_mutex)); while (1) { - usleep(1000000); - if (pool->working_cnt != 0) { + if (pool->done_cnt < pool->work_cnt) { pthread_cond_wait(&(pool->working_cond), &(pool->work_mutex)); } else { pool->stop = 1; break; } + progress_bar_print(100.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size); } pthread_mutex_unlock(&(pool->work_mutex)); } @@ -163,16 +161,20 @@ void tpool_destroy(tpool_t *pool) { free(work); work = tmp; } - pool->stop = 1; + pthread_cond_broadcast(&(pool->has_work_cond)); pthread_mutex_unlock(&(pool->work_mutex)); - tpool_wait(pool); + for (size_t i = 0; i < pool->thread_cnt; i++) { + pthread_t thread = pool->threads[i]; + pthread_cancel(thread); + } pthread_mutex_destroy(&(pool->work_mutex)); pthread_cond_destroy(&(pool->has_work_cond)); pthread_cond_destroy(&(pool->working_cond)); + free(pool->threads); free(pool); } @@ -184,9 +186,11 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) { tpool_t *pool = malloc(sizeof(tpool_t)); pool->thread_cnt = thread_cnt; - pool->working_cnt = 0; + pool->work_cnt =0; + pool->done_cnt =0; pool->stop = 0; pool->cleanup_func = cleanup_func; + pool->threads = malloc(sizeof(pthread_t) * thread_cnt); pthread_mutex_init(&(pool->work_mutex), NULL); @@ -197,7 +201,7 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) { pool->work_tail = NULL; for (size_t i = 0; i < thread_cnt; i++) { - pthread_t thread; + pthread_t thread = pool->threads[i]; pthread_create(&thread, NULL, tpool_worker, pool); pthread_detach(thread); } diff --git a/src/util.c b/src/util.c index a44cde5..6a1ee11 100644 --- a/src/util.c +++ b/src/util.c @@ -6,18 +6,37 @@ #define PBWIDTH 40 char *abspath(const char *path) { - char *abs = canonicalize_file_name(path); - abs = realloc(abs, strlen(abs) + 1); + wordexp_t w; + wordexp(path, &w, 0); + + char *abs = canonicalize_file_name(w.we_wordv[0]); + if (abs == NULL) { + return NULL; + } + abs = realloc(abs, strlen(abs) + 2); strcat(abs, "/"); + wordfree(&w); return abs; } +char *expandpath(const char *path) { + wordexp_t w; + wordexp(path, &w, 0); + + char * expanded = malloc(strlen(w.we_wordv[0]) + 2); + strcpy(expanded, w.we_wordv[0]); + strcat(expanded, "/"); + + wordfree(&w); + return expanded; +} + void progress_bar_print(double percentage, size_t tn_size, size_t index_size) { static int last_val = 0; int val = (int) (percentage * 100); - if (last_val == val || val >= 100) { + if (last_val == val || val > 100 || index_size < 1024) { return; } last_val = val; @@ -44,7 +63,7 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) { } printf( - "\r%2d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c", + "\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c", val, lpad, PBSTR, rpad, "", (int) tn_size, tn_unit, (int) index_size, index_unit diff --git a/src/util.h b/src/util.h index 7961a66..a868602 100644 --- a/src/util.h +++ b/src/util.h @@ -148,6 +148,7 @@ int text_buffer_append_char(text_buffer_t *buf, int c) { } char *abspath(const char * path); +char *expandpath(const char *path); void progress_bar_print(double percentage, size_t tn_size, size_t index_size); diff --git a/src/web/serve.c b/src/web/serve.c index 7a359d2..9f4535e 100644 --- a/src/web/serve.c +++ b/src/web/serve.c @@ -153,7 +153,7 @@ int chunked_response_file(const char *filename, const char *mime, // %d- ends = MIN(starts + CHUNK_SIZE, length); } - if (ends >= length || starts >= length || starts < 0) { + if (ends > length || starts >= length || starts < 0) { close(fd); return OCS_INTERNAL_ERROR; }