bugfixes & refactoring

This commit is contained in:
simon 2019-10-26 12:35:01 -04:00
parent 564a17a8fa
commit c3b7a05dde
21 changed files with 440 additions and 225 deletions

2
.gitignore vendored
View File

@ -13,3 +13,5 @@ sist2*
index.sist2/
bundle.css
bundle.js
*.a
vgcore.*

View File

@ -32,7 +32,7 @@ add_executable(
# LMDB
lmdb/libraries/liblmdb/lmdb.h lmdb/libraries/liblmdb/mdb.c
lmdb/libraries/liblmdb/midl.h lmdb/libraries/liblmdb/midl.c
)
src/cli.c src/cli.h)
find_package(PkgConfig REQUIRED)
set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:/usr/local/lib/pkgconfig/")
@ -84,8 +84,8 @@ include_directories(
target_compile_options(sist2
PRIVATE
-Ofast
# -march=native
-O3
-march=native
-fno-stack-protector
-fomit-frame-pointer
)
@ -93,16 +93,18 @@ target_compile_options(sist2
TARGET_LINK_LIBRARIES(
sist2
${GLIB_LIBRARIES}
${GOBJECT_LIBRARIES}
${UUID_LIBRARIES}
${GLIB_LIBRARIES}
# ffmpeg
${PROJECT_SOURCE_DIR}/lib/libavcodec.a
${PROJECT_SOURCE_DIR}/lib/libavformat.a
${PROJECT_SOURCE_DIR}/lib/libavutil.a
${PROJECT_SOURCE_DIR}/lib/libswscale.a
${PROJECT_SOURCE_DIR}/lib/libswresample.a
# ${PROJECT_SOURCE_DIR}/lib/libavcodec.a
# ${PROJECT_SOURCE_DIR}/lib/libavformat.a
# ${PROJECT_SOURCE_DIR}/lib/libavutil.a
# ${PROJECT_SOURCE_DIR}/lib/libswscale.a
# ${PROJECT_SOURCE_DIR}/lib/libswresample.a
${FFMPEG_LIBRARIES}
swscale
# mupdf
${PROJECT_SOURCE_DIR}/lib/libmupdf.a

Binary file not shown.

View File

@ -11,27 +11,23 @@ sist2 (Simple incremental search tool)
## Example usage
```bash
sist2 scan [OPTION]... PATH
See help page `sist2 --help` for more details.
# Examples
**Scan a directory**
```bash
sist2 scan ~/Documents -o ./orig_idx/
sist2 scan --threads 4 --content-size 16384 /mnt/Pictures
sist2 scan -i ./orig_idx/ -o ./updated_idx/ ~/Documents
```
**Push index to Elasticsearch or file**
```bash
sist2 index [OPTION]... INDEX
# Examples
sist2 index --force-reset ./my_idx
sist2 index --print ./my_idx > raw_documents.ndjson
```
**Start web interface**
```bash
sist2 web [OPTION]... INDEX...
# Examples
sist2 web --bind 0.0.0.0 --port 4321 ./my_idx1 ./my_idx2 ./my_idx3
```

View File

@ -352,4 +352,8 @@ audio/mp4, m4b
!image/vnd.djvu, djvu
application/x-ms-reader, lit
application/CDFV2-corrupt,
text/x-vcard, vcf
text/x-vcard, vcf
application/x-innosetup,
application/winhelp, hlp
image/x-tga,
application/x-wine-extension-ini,
1 application/arj arj
352 !image/vnd.djvu djvu
353 application/x-ms-reader lit
354 application/CDFV2-corrupt
355 text/x-vcard vcf
356 application/x-innosetup
357 application/winhelp hlp
358 image/x-tga
359 application/x-wine-extension-ini

View File

@ -1,5 +1,7 @@
#!/bin/bash
rm -rf index.sist2/
rm web/js/bundle.js 2> /dev/null
cat `ls -v web/js/*.min.js` > web/js/bundle.js
cat web/js/{util,dom,search}.js >> web/js/bundle.js

View File

@ -8,15 +8,6 @@ cd ..
mv mupdf/build/release/libmupdf.a .
mv mupdf/build/release/libmupdf-third.a .
# libpcre
cd libpcre
./autogen.sh
./configure --disable-shared
make -j 4
cd ..
mv libpcre/.libs/libpcre.a .
# ffmpeg
cd ffmpeg
./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay \

155
src/cli.c Normal file
View File

@ -0,0 +1,155 @@
#include "cli.h"
#define DEFAULT_OUTPUT "index.sist2/"
#define DEFAULT_CONTENT_SIZE 4096
#define DEFAULT_QUALITY 15
#define DEFAULT_SIZE 200
#define DEFAULT_REWRITE_URL ""
#define DEFAULT_ES_URL "http://localhost:9200"
#define DEFAULT_BIND_ADDR "localhost"
#define DEFAULT_PORT "4090"
/**
 * Allocate a zero-initialized scan_args_t.
 *
 * Every field starts at 0/NULL; scan_args_validate() replaces most
 * zero/NULL fields with their defaults.
 *
 * @return the new structure, or NULL if the allocation fails.
 */
scan_args_t *scan_args_create(void) {
    /* calloc takes (nmemb, size): one element of the structure's size. */
    scan_args_t *args = calloc(1, sizeof *args);
    return args;
}
/**
 * Allocate a zero-initialized index_args_t.
 *
 * A NULL es_url is later replaced with the default by
 * index_args_validate(); the boolean flags start cleared.
 *
 * @return the new structure, or NULL if the allocation fails.
 */
index_args_t *index_args_create(void) {
    /* calloc takes (nmemb, size): one element of the structure's size. */
    index_args_t *args = calloc(1, sizeof *args);
    return args;
}
/**
 * Allocate a zero-initialized web_args_t.
 *
 * NULL es_url/bind/port fields are later replaced with their defaults
 * by web_args_validate().
 *
 * @return the new structure, or NULL if the allocation fails.
 */
web_args_t *web_args_create(void) {
    /* calloc takes (nmemb, size): one element of the structure's size. */
    web_args_t *args = calloc(1, sizeof *args);
    return args;
}
/**
 * Validate the options of the `scan` command and fill in defaults.
 *
 * argv[0] is the command name and argv[1] the PATH to scan. Fields left
 * at 0/NULL by the option parser receive their default value; values
 * outside the accepted range are rejected. The output directory is
 * created here, so an output path that cannot be created (including one
 * that already exists) is reported as an error.
 *
 * @param args  scan options to validate; updated in place.
 * @param argc  number of remaining positional arguments (command included).
 * @param argv  remaining positional arguments.
 * @return 0 when the arguments are usable, 1 after printing an error to stderr.
 */
int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Required positional argument: PATH.\n");
        return 1;
    }

    char *abs_path = abspath(argv[1]);
    if (abs_path == NULL) {
        fprintf(stderr, "File not found: %s\n", argv[1]);
        return 1;
    }
    args->path = abs_path;

    if (args->incremental != NULL) {
        abs_path = abspath(args->incremental);
        if (abs_path == NULL) {
            fprintf(stderr, "File not found: %s\n", args->incremental);
            return 1;
        }
        /*
         * Keep the resolved path instead of discarding it: abspath()
         * appends a trailing '/', which the scan step needs when it
         * builds "<incremental>thumbs" from this field.
         */
        args->incremental = abs_path;
    }

    if (args->quality == 0) {
        args->quality = DEFAULT_QUALITY;
    } else if (args->quality < 1 || args->quality > 31) {
        fprintf(stderr, "Invalid quality: %f\n", args->quality);
        return 1;
    }

    if (args->size == 0) {
        args->size = DEFAULT_SIZE;
    } else if (args->size < 0) {
        fprintf(stderr, "Invalid size: %d\n", args->size);
        return 1;
    }

    if (args->content_size == 0) {
        args->content_size = DEFAULT_CONTENT_SIZE;
    } else if (args->content_size < 0) {
        fprintf(stderr, "Invalid content-size: %d\n", args->content_size);
        return 1;
    }

    if (args->threads == 0) {
        args->threads = 1;
    } else if (args->threads < 0) {
        fprintf(stderr, "Invalid threads: %d\n", args->threads);
        return 1;
    }

    if (args->output == NULL) {
        /* sizeof of the literal already counts the terminating NUL. */
        args->output = malloc(sizeof(DEFAULT_OUTPUT));
        if (args->output != NULL) {
            strcpy(args->output, DEFAULT_OUTPUT);
        }
    } else {
        args->output = expandpath(args->output);
    }
    if (args->output == NULL) {
        fprintf(stderr, "Could not allocate memory for the output path.\n");
        return 1;
    }

    /* Owner-only permissions (rwx------) for the new index directory. */
    int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR);
    if (ret != 0) {
        fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno));
        return 1;
    }

    if (args->name == NULL) {
        args->name = g_path_get_basename(args->output);
    }

    if (args->rewrite_url == NULL) {
        args->rewrite_url = DEFAULT_REWRITE_URL;
    }

    return 0;
}
/**
 * Validate the options of the `index` command and fill in defaults.
 *
 * argv[0] is the command name and argv[1] the index directory. The
 * directory must resolve through abspath(); the path is stored exactly
 * as the user typed it.
 *
 * @param args  index options to validate; updated in place.
 * @param argc  number of remaining positional arguments (command included).
 * @param argv  remaining positional arguments.
 * @return 0 when the arguments are usable, 1 after printing an error to stderr.
 */
int index_args_validate(index_args_t *args, int argc, const char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Required positional argument: PATH.\n");
        return 1;
    }

    /* abspath() is only used as an existence check here. */
    char *index_path = abspath(argv[1]);
    if (index_path == NULL) {
        fprintf(stderr, "File not found: %s\n", argv[1]);
        return 1;
    }
    free(index_path);
    args->index_path = argv[1];

    if (args->es_url == NULL) {
        args->es_url = DEFAULT_ES_URL;
    }

    return 0;
}
/**
 * Validate the options of the `web` command and fill in defaults.
 *
 * argv[0] is the command name; every following argument is an index
 * directory. The index list points into argv (no copy is made), and each
 * entry must resolve through abspath().
 *
 * @param args  web options to validate; updated in place.
 * @param argc  number of remaining positional arguments (command included).
 * @param argv  remaining positional arguments.
 * @return 0 when the arguments are usable, 1 after printing an error to stderr.
 */
int web_args_validate(web_args_t *args, int argc, const char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Required positional argument: PATH.\n");
        return 1;
    }

    if (args->es_url == NULL) {
        args->es_url = DEFAULT_ES_URL;
    }

    if (args->bind == NULL) {
        args->bind = DEFAULT_BIND_ADDR;
    }

    if (args->port == NULL) {
        args->port = DEFAULT_PORT;
    }

    args->index_count = argc - 1;
    args->indices = argv + 1;

    for (int i = 0; i < args->index_count; i++) {
        /* abspath() is only used as an existence check here. */
        char *abs_path = abspath(args->indices[i]);
        if (abs_path == NULL) {
            /* Report the argument as typed; abs_path is NULL here. */
            fprintf(stderr, "File not found: %s\n", args->indices[i]);
            return 1;
        }
        free(abs_path);
    }

    return 0;
}

41
src/cli.h Normal file
View File

@ -0,0 +1,41 @@
#ifndef SIST2_CLI_H
#define SIST2_CLI_H
#include "sist.h"
typedef struct scan_args {
float quality;
int size;
int content_size;
int threads;
char *incremental;
char *output;
char *rewrite_url;
char *name;
char *path;
} scan_args_t;
/*
 * Options of the `index` command.
 * A NULL es_url is replaced with the default by index_args_validate().
 */
typedef struct index_args {
/* Elasticsearch URL; NULL selects the default (http://localhost:9200). */
char *es_url;
/* Index directory, taken from the first positional argument. */
const char *index_path;
/* Non-zero: just print the JSON documents to stdout. */
int print;
/* Non-zero: reset the Elasticsearch mappings and settings. */
int force_reset;
} index_args_t;
/*
 * Options of the `web` command.
 * NULL strings are replaced with defaults by web_args_validate().
 */
typedef struct web_args {
/* Elasticsearch URL; NULL selects the default (http://localhost:9200). */
char *es_url;
/* Address to listen on; NULL selects the default (localhost). */
char *bind;
/* Port to listen on, kept as a string; NULL selects the default (4090). */
char *port;
/* Number of entries in indices. */
int index_count;
/* Index directories; web_args_validate() points this into argv. */
const char **indices;
} web_args_t;
/* Constructors: each returns a calloc()-allocated, zero-filled structure. */
scan_args_t *scan_args_create();
index_args_t *index_args_create();
web_args_t *web_args_create();
/*
 * Validators: check the parsed options, fill in defaults, and return 0 on
 * success or 1 after printing an error to stderr. argv[0] is the command
 * name; the positional arguments start at argv[1].
 */
int scan_args_validate(scan_args_t *args, int argc, const char **argv);
int index_args_validate(index_args_t *args, int argc, const char **argv);
int web_args_validate(web_args_t *args, int argc, const char **argv);
#endif

View File

@ -21,6 +21,8 @@ struct {
GHashTable *original_table;
GHashTable *copy_table;
pthread_mutex_t mupdf_mu;
} ScanCtx;

View File

@ -20,13 +20,13 @@ store_t *store_create(char *path) {
}
store->size = (size_t) 1024 * 1024 * 5;
ScanCtx.stat_tn_size = store->size;
ScanCtx.stat_tn_size = 0;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi
MDB_txn *txn;
int r3 = mdb_txn_begin(store->env, NULL, 0, &txn);
int r4 = mdb_dbi_open(txn, NULL, 0, &store->dbi);
mdb_txn_begin(store->env, NULL, 0, &txn);
mdb_dbi_open(txn, NULL, 0, &store->dbi);
mdb_txn_commit(txn);
return store;
@ -55,6 +55,7 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
mdb_txn_begin(store->env, NULL, 0, &txn);
int put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
ScanCtx.stat_tn_size += buf_len;
if (put_ret == MDB_MAP_FULL) {
mdb_txn_abort(txn);
@ -67,7 +68,6 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
mdb_env_set_mapsize(store->env, store->size);
mdb_txn_begin(store->env, NULL, 0, &txn);
put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
ScanCtx.stat_tn_size = store->size;
}
mdb_txn_commit(txn);

View File

@ -1,8 +1,11 @@
#include "sist.h"
#include "ctx.h"
static const char *const Version = "1.0.0";
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "1.0.0";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
@ -10,6 +13,11 @@ static const char *const usage[] = {
NULL,
};
/* One-time process-wide setup for the third-party libraries. */
void global_init() {
// Initialize libcurl with the CURL_GLOBAL_NOTHING flag set
curl_global_init(CURL_GLOBAL_NOTHING);
// Set the libav/ffmpeg log level to AV_LOG_QUIET
av_log_set_level(AV_LOG_QUIET);
}
void init_dir(const char *dirpath) {
char path[PATH_MAX];
snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath);
@ -32,11 +40,16 @@ void scan_print_header() {
printf("output\t\t%s\n", ScanCtx.index.path);
}
void sist2_scan(const char *path, const char *incremental_from) {
void sist2_scan(scan_args_t *args) {
av_log_set_level(AV_LOG_QUIET);
strcpy(ScanCtx.index.desc.root, abspath(path));
ScanCtx.tn_qscale = args->quality;
ScanCtx.tn_size = args->size;
ScanCtx.content_size = args->content_size;
ScanCtx.pool = tpool_create(args->threads, serializer_cleanup);
ScanCtx.threads = args->threads;
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
strcpy(ScanCtx.index.desc.root, args->path);
ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root);
init_dir(ScanCtx.index.path);
@ -51,12 +64,11 @@ void sist2_scan(const char *path, const char *incremental_from) {
scan_print_header();
if (incremental_from != NULL) {
incremental_from = abspath(incremental_from);
if (args->incremental != NULL) {
ScanCtx.original_table = incremental_get_table();
ScanCtx.copy_table = incremental_get_table();
DIR *dir = opendir(incremental_from);
DIR *dir = opendir(args->incremental);
if (dir == NULL) {
perror("opendir");
return;
@ -65,7 +77,7 @@ void sist2_scan(const char *path, const char *incremental_from) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", incremental_from, de->d_name);
snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name);
incremental_read(ScanCtx.original_table, file_path);
}
}
@ -76,14 +88,15 @@ void sist2_scan(const char *path, const char *incremental_from) {
walk_directory_tree(ScanCtx.index.desc.root);
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
if (incremental_from != NULL) {
if (args->incremental != NULL) {
char dst_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", incremental_from);
snprintf(store_path, PATH_MAX, "%sthumbs", args->incremental);
snprintf(dst_path, PATH_MAX, "%s_index_original", ScanCtx.index.path);
store_t *source = store_create(store_path);
DIR *dir = opendir(incremental_from);
DIR *dir = opendir(args->incremental);
if (dir == NULL) {
perror("opendir");
return;
@ -92,7 +105,7 @@ void sist2_scan(const char *path, const char *incremental_from) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", incremental_from, de->d_name);
snprintf(file_path, PATH_MAX, "%s/%s", args->incremental, de->d_name);
incremental_copy(source, ScanCtx.index.store, file_path, dst_path, ScanCtx.copy_table);
}
}
@ -101,16 +114,18 @@ void sist2_scan(const char *path, const char *incremental_from) {
}
store_destroy(ScanCtx.index.store);
tpool_destroy(ScanCtx.pool);
}
void sist2_index(const char *path, int print_index, int arg_force_reset) {
if (!print_index) {
elastic_init(arg_force_reset);
void sist2_index(index_args_t *args) {
IndexCtx.es_url = args->es_url;
if (!args->print) {
elastic_init(args->force_reset);
}
char *index_path = abspath(path);
char descriptor_path[PATH_MAX];
snprintf(descriptor_path, PATH_MAX, "%s/descriptor.json", index_path);
snprintf(descriptor_path, PATH_MAX, "%s/descriptor.json", args->index_path);
index_descriptor_t desc = read_index_descriptor(descriptor_path);
if (strcmp(desc.version, Version) != 0) {
@ -118,14 +133,14 @@ void sist2_index(const char *path, int print_index, int arg_force_reset) {
return;
}
DIR *dir = opendir(index_path);
DIR *dir = opendir(args->index_path);
if (dir == NULL) {
perror("opendir");
return;
}
index_func f;
if (print_index) {
if (args->print) {
f = print_json;
} else {
f = index_json;
@ -135,22 +150,27 @@ void sist2_index(const char *path, int print_index, int arg_force_reset) {
while ((de = readdir(dir)) != NULL) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", index_path, de->d_name);
snprintf(file_path, PATH_MAX, "%s/%s", args->index_path, de->d_name);
read_index(file_path, desc.uuid, f);
}
}
if (!print_index) {
if (!args->print) {
elastic_flush();
destroy_indexer();
}
}
void sist2_web(const char *indices[], int index_count, const char *host, const char *port) {
void sist2_web(web_args_t *args) {
for (int i = 0; i < index_count; i++) {
char *abs_path = abspath(indices[i]);
WebCtx.es_url = args->es_url;
WebCtx.index_count = args->index_count;
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
if (abs_path == NULL) {
return;
}
char path_tmp[PATH_MAX];
snprintf(path_tmp, PATH_MAX, "%sthumbs", abs_path);
@ -161,168 +181,88 @@ void sist2_web(const char *indices[], int index_count, const char *host, const c
strcpy(WebCtx.indices[i].path, abs_path);
printf("Loaded index: %s\n", WebCtx.indices[i].desc.name);
free(abs_path);
}
WebCtx.index_count = index_count;
serve(host, port);
serve(args->bind, args->port);
}
int main(int argc, const char *argv[]) {
curl_global_init(CURL_GLOBAL_NOTHING);
global_init();
float arg_quality = 0;
int arg_size = 0;
int arg_content_size = 0;
int arg_threads = 0;
char *arg_incremental = NULL;
char *arg_output = NULL;
char *arg_rewrite_url = NULL;
char *arg_name = NULL;
scan_args_t *scan_args = scan_args_create();
index_args_t *index_args = index_args_create();
web_args_t *web_args = web_args_create();
char *arg_es_url = NULL;
int arg_print_index = 0;
int arg_force_reset = 0;
char *arg_web_host = NULL;
char *arg_web_port = NULL;
char * common_es_url;
struct argparse_option options[] = {
OPT_HELP(),
OPT_GROUP("Scan options"),
OPT_INTEGER('t', "threads", &arg_threads, "Number of threads. DEFAULT=1"),
OPT_FLOAT('q', "quality", &arg_quality,
OPT_INTEGER('t', "threads", &scan_args->threads, "Number of threads. DEFAULT=1"),
OPT_FLOAT('q', "quality", &scan_args->quality,
"Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=15"),
OPT_INTEGER(0, "size", &arg_size, "Thumbnail size, in pixels. DEFAULT=200"),
OPT_INTEGER(0, "content-size", &arg_content_size,
OPT_INTEGER(0, "size", &scan_args->size, "Thumbnail size, in pixels. DEFAULT=200"),
OPT_INTEGER(0, "content-size", &scan_args->content_size,
"Number of bytes to be extracted from text documents. DEFAULT=4096"),
OPT_STRING(0, "incremental", &arg_incremental, "Reuse an existing index and only scan modified files."),
OPT_STRING('o', "output", &arg_output, "Output directory. DEFAULT=index.sist2/"),
OPT_STRING(0, "rewrite-url", &arg_rewrite_url, "Serve files from this url instead of from disk."),
OPT_STRING(0, "name", &arg_name, "Index display name. DEFAULT: (name of the directory)"),
OPT_STRING(0, "incremental", &scan_args->incremental,
"Reuse an existing index and only scan modified files."),
OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"),
OPT_STRING(0, "rewrite-url", &scan_args->rewrite_url, "Serve files from this url instead of from disk."),
OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: (name of the directory)"),
OPT_GROUP("Index options"),
OPT_STRING(0, "es-url", &arg_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_BOOLEAN('p', "print", &arg_print_index, "Just print JSON documents to stdout."),
OPT_BOOLEAN('f', "force-reset", &arg_force_reset, "Reset Elasticsearch mappings and settings. "
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_BOOLEAN('p', "print", &index_args->print, "Just print JSON documents to stdout."),
OPT_BOOLEAN('f', "force-reset", &index_args->force_reset, "Reset Elasticsearch mappings and settings. "
"(You must use this option the first time you use the index command)"),
OPT_GROUP("Web options"),
OPT_STRING(0, "es-url", &arg_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_STRING(0, "bind", &arg_web_host, "Listen on this address. DEFAULT=localhost"),
OPT_STRING(0, "port", &arg_web_port, "Listen on this port. DEFAULT=4090"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),
OPT_STRING(0, "bind", &web_args->bind, "Listen on this address. DEFAULT=localhost"),
OPT_STRING(0, "port", &web_args->port, "Listen on this port. DEFAULT=4090"),
OPT_END(),
};
struct argparse argparse;
argparse_init(&argparse, options, usage, 0);
argparse_describe(
&argparse,
"\nLightning-fast file system indexer and search tool.",
"\nMade by simon987 <me@simon987.net>. Released under GPL-3.0"
);
argparse_describe(&argparse, DESCRIPTION, EPILOG);
argc = argparse_parse(&argparse, argc, argv);
//Set defaults
if (arg_quality == 0) {
arg_quality = 15;
} else if (arg_quality < 1 || arg_quality > 31) {
fprintf(stderr, "Invalid quality: %f\n", arg_quality);
return 1;
}
web_args->es_url = common_es_url;
index_args->es_url = common_es_url;
if (arg_size == 0) {
arg_size = 200;
} else if (arg_size <= 0) {
fprintf(stderr, "Invalid size: %d\n", arg_size);
return 1;
}
if (arg_content_size == 0) {
arg_content_size = 4096;
} else if (arg_content_size <= 0) {
fprintf(stderr, "Invalid content-size: %d\n", arg_content_size);
return 1;
}
if (arg_threads == 0) {
arg_threads = 1;
} else if (arg_threads < 0) {
fprintf(stderr, "Invalid threads: %d\n", arg_threads);
return 1;
}
if (arg_output == NULL) {
arg_output = "index.sist2/";
}
if (arg_es_url == NULL) {
arg_es_url = "http://localhost:9200";
}
if (arg_web_host == NULL) {
arg_web_host = "localhost";
}
if (arg_web_port == NULL) {
arg_web_port = "4090";
}
// Commands
if (argc == 0) {
argparse_usage(&argparse);
return 1;
} else if (strcmp(argv[0], "scan") == 0) {
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
argparse_usage(&argparse);
return 1;
}
if (arg_name == NULL) {
arg_name = g_path_get_basename(argv[1]);
int err = scan_args_validate(scan_args, argc, argv);
if (err != 0) {
return err;
}
sist2_scan(scan_args);
int ret = mkdir(arg_output, S_IRUSR | S_IWUSR | S_IXUSR);
if (ret != 0) {
fprintf(stderr, "Invalid output: '%s' (%s).\n", arg_output, strerror(errno));
return 1;
}
ScanCtx.tn_qscale = arg_quality;
ScanCtx.tn_size = arg_size;
ScanCtx.content_size = arg_content_size;
ScanCtx.pool = tpool_create(arg_threads, serializer_cleanup);
ScanCtx.threads = arg_threads;
strncpy(ScanCtx.index.path, arg_output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, arg_name, sizeof(ScanCtx.index.desc.name));
if (arg_rewrite_url == NULL) {
strcpy(ScanCtx.index.desc.rewrite_url, "");
} else {
strcpy(ScanCtx.index.desc.rewrite_url, arg_rewrite_url);
}
sist2_scan(argv[1], arg_incremental);
} else if (strcmp(argv[0], "index") == 0) {
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
argparse_usage(&argparse);
return 1;
}
IndexCtx.es_url = arg_es_url;
sist2_index(argv[1], arg_print_index, arg_force_reset);
int err = index_args_validate(index_args, argc, argv);
if (err != 0) {
return err;
}
sist2_index(index_args);
} else if (strcmp(argv[0], "web") == 0) {
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
argparse_usage(&argparse);
return 1;
}
WebCtx.es_url = arg_es_url;
sist2_web(argv + 1, argc - 1, arg_web_host, arg_web_port);
int err = web_args_validate(web_args, argc, argv);
if (err != 0) {
return err;
}
sist2_web(web_args);
} else {
fprintf(stderr, "Invalid command: '%s'\n", argv[0]);
argparse_usage(&argparse);

View File

@ -153,6 +153,10 @@ void parse_media(const char *filepath, document_t *doc) {
int video_stream = -1;
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
fprintf(stderr, "Could not allocate AVFormatContext! %s \n", filepath);
return;
}
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
printf("ERR%s %s\n", filepath, av_err2str(res));

View File

@ -360,6 +360,10 @@ enum mime {
application_x_ms_reader=655712,
application_CDFV2_corrupt=655713,
text_x_vcard=590178,
application_x_innosetup=655715,
application_winhelp=655716,
image_x_tga=524645,
application_x_wine_extension_ini=655718,
};
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj";
@ -716,6 +720,10 @@ case image_vnd_djvu: return "image/vnd.djvu";
case application_x_ms_reader: return "application/x-ms-reader";
case application_CDFV2_corrupt: return "application/CDFV2-corrupt";
case text_x_vcard: return "text/x-vcard";
case application_x_innosetup: return "application/x-innosetup";
case application_winhelp: return "application/winhelp";
case image_x_tga: return "image/x-tga";
case application_x_wine_extension_ini: return "application/x-wine-extension-ini";
default: return NULL;}}
GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(ext_table, "arj", (gpointer)application_arj);
@ -1192,6 +1200,7 @@ g_hash_table_insert(ext_table, "m4b", (gpointer)audio_mp4);
g_hash_table_insert(ext_table, "djvu", (gpointer)image_vnd_djvu);
g_hash_table_insert(ext_table, "lit", (gpointer)application_x_ms_reader);
g_hash_table_insert(ext_table, "vcf", (gpointer)text_x_vcard);
g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp);
return ext_table;}
GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj);
@ -1548,5 +1557,9 @@ g_hash_table_insert(mime_table, "image/vnd.djvu", (gpointer)image_vnd_djvu);
g_hash_table_insert(mime_table, "application/x-ms-reader", (gpointer)application_x_ms_reader);
g_hash_table_insert(mime_table, "application/CDFV2-corrupt", (gpointer)application_CDFV2_corrupt);
g_hash_table_insert(mime_table, "text/x-vcard", (gpointer)text_x_vcard);
g_hash_table_insert(mime_table, "application/x-innosetup", (gpointer)application_x_innosetup);
g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp);
g_hash_table_insert(mime_table, "image/x-tga", (gpointer)image_x_tga);
g_hash_table_insert(mime_table, "application/x-wine-extension-ini", (gpointer)application_x_wine_extension_ini);
return mime_table;}
#endif

View File

@ -94,7 +94,7 @@ void parse(void *arg) {
if (!(SHOULD_PARSE(doc.mime))) {
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
parse_media(job->filepath, &doc);
// parse_media(job->filepath, &doc);
} else if (IS_PDF(doc.mime)) {
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
@ -105,15 +105,15 @@ void parse(void *arg) {
}
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
parse_text(bytes_read, &fd, (char *) buf, &doc);
// parse_text(bytes_read, &fd, (char *) buf, &doc);
} else if (IS_FONT(doc.mime)) {
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
parse_font(font_buf, doc.size, &doc);
if (font_buf != buf) {
free(font_buf);
}
// void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
// parse_font(font_buf, doc.size, &doc);
//
// if (font_buf != buf) {
// free(font_buf);
// }
}
write_document(&doc);

View File

@ -1,13 +1,13 @@
#include "pdf.h"
#include "src/ctx.h"
__always_inline
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
fz_page *cover = fz_load_page(ctx, fzdoc, 0);
fz_rect bounds = fz_bound_page(ctx, cover);
float scale;
unsigned char *tn_buf;
float w = (float) bounds.x1 - bounds.x0;
float h = (float) bounds.y1 - bounds.y0;
if (w > h) {
@ -17,11 +17,21 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
}
fz_matrix m = fz_scale(scale, scale);
fz_pixmap *pixmap;
fz_colorspace *color_space = fz_device_rgb(ctx);
pixmap = fz_new_pixmap_from_page(ctx, cover, m, color_space, 0);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(ctx, ctx->colorspace->rgb, bbox, NULL, 0);
fz_clear_pixmap_with_value(ctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(ctx, m, pixmap);
pthread_mutex_lock(&ScanCtx.mupdf_mu);
fz_run_page(ctx, cover, dev, fz_identity, NULL);
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
fz_drop_device(ctx, dev);
fz_buffer *fzbuf = fz_new_buffer_from_pixmap_as_png(ctx, pixmap, fz_default_color_params);
unsigned char *tn_buf;
size_t tn_len = fz_buffer_storage(ctx, fzbuf, &tn_buf);
store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len);
@ -32,29 +42,38 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
return cover;
}
void fz_noop_callback(void *user, const char *message) {
}
void fz_noop_callback(void *user, const char *message) {}
void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
//TODO error handling
static int mu_is_initialized = 0;
if (!mu_is_initialized) {
pthread_mutex_init(&ScanCtx.mupdf_mu, NULL);
mu_is_initialized = 1;
}
fz_context *ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
fz_stream *stream = NULL;
fz_document *fzdoc = NULL;
fz_var(stream);
fz_var(fzdoc);
fz_try(ctx)
{
fz_disable_icc(ctx);
fz_register_document_handlers(ctx);
ctx->warn.print = fz_noop_callback; //disable warnings
//disable warnings
ctx->warn.print = fz_noop_callback;
ctx->error.print = fz_noop_callback;
fz_stream *stream = fz_open_memory(ctx, buf, buf_len);
fz_document *fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);
stream = fz_open_memory(ctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);
int page_count = fz_count_pages(ctx, fzdoc);
fz_page *cover = render_cover(ctx, doc, fzdoc);
fz_stext_options opts;
fz_parse_stext_options(ctx, &opts, "preserve-ligatures");
text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size);
@ -65,12 +84,34 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
} else {
page = fz_load_page(ctx, fzdoc, current_page);
}
fz_stext_page *stext = fz_new_stext_page_from_page(ctx, page, &opts);
fz_stext_page *stext;
fz_device *dev = NULL;
fz_var(dev);
stext = fz_new_stext_page(ctx, fz_bound_page(ctx, page));
fz_try(ctx)
{
dev = fz_new_stext_device(ctx, stext, &opts);
pthread_mutex_lock(&ScanCtx.mupdf_mu);
fz_run_page_contents(ctx, page, dev, fz_identity, NULL);
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
fz_close_device(ctx, dev);
}
fz_always(ctx)
fz_drop_device(ctx, dev);
fz_catch(ctx)
{
fz_drop_stext_page(ctx, stext);
fz_rethrow(ctx);
}
fz_stext_block *block = stext->first_block;
while (block != NULL) {
if (block->type != FZ_STEXT_BLOCK_TEXT) {
block = block->next;
continue;
}
@ -100,15 +141,12 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur);
text_buffer_destroy(&text_buf);
APPEND_META(doc, meta_content)
}
fz_always(ctx)
{
fz_drop_stream(ctx, stream);
fz_drop_document(ctx, fzdoc);
fz_drop_context(ctx);
}
fz_catch(ctx)
{
// printf("err");
}
} fz_catch(ctx) {}
}

View File

@ -49,6 +49,7 @@
#include "parsing/font.h"
#include "index/web.h"
#include "web/serve.h"
#include "cli.h"
;

View File

@ -18,7 +18,8 @@ typedef struct tpool {
pthread_cond_t has_work_cond;
pthread_cond_t working_cond;
int working_cnt;
pthread_t *threads;
int thread_cnt;
int work_cnt;
int done_cnt;
@ -109,7 +110,6 @@ static void *tpool_worker(void *arg) {
}
tpool_work_t *work = tpool_work_get(pool);
pool->working_cnt++;
pthread_mutex_unlock(&(pool->work_mutex));
if (work != NULL) {
@ -118,12 +118,11 @@ static void *tpool_worker(void *arg) {
}
pthread_mutex_lock(&(pool->work_mutex));
pool->working_cnt--;
pool->done_cnt++;
progress_bar_print((double)pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
if (pool->working_cnt == 0 && pool->work_head == NULL) {
if (pool->work_head == NULL) {
pthread_cond_signal(&(pool->working_cond));
}
pthread_mutex_unlock(&(pool->work_mutex));
@ -131,7 +130,6 @@ static void *tpool_worker(void *arg) {
pool->cleanup_func();
pool->thread_cnt--;
pthread_cond_signal(&(pool->working_cond));
pthread_mutex_unlock(&(pool->work_mutex));
return NULL;
@ -140,13 +138,13 @@ static void *tpool_worker(void *arg) {
void tpool_wait(tpool_t *pool) {
pthread_mutex_lock(&(pool->work_mutex));
while (1) {
usleep(1000000);
if (pool->working_cnt != 0) {
if (pool->done_cnt < pool->work_cnt) {
pthread_cond_wait(&(pool->working_cond), &(pool->work_mutex));
} else {
pool->stop = 1;
break;
}
progress_bar_print(100.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
}
pthread_mutex_unlock(&(pool->work_mutex));
}
@ -163,16 +161,20 @@ void tpool_destroy(tpool_t *pool) {
free(work);
work = tmp;
}
pool->stop = 1;
pthread_cond_broadcast(&(pool->has_work_cond));
pthread_mutex_unlock(&(pool->work_mutex));
tpool_wait(pool);
for (size_t i = 0; i < pool->thread_cnt; i++) {
pthread_t thread = pool->threads[i];
pthread_cancel(thread);
}
pthread_mutex_destroy(&(pool->work_mutex));
pthread_cond_destroy(&(pool->has_work_cond));
pthread_cond_destroy(&(pool->working_cond));
free(pool->threads);
free(pool);
}
@ -184,9 +186,11 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) {
tpool_t *pool = malloc(sizeof(tpool_t));
pool->thread_cnt = thread_cnt;
pool->working_cnt = 0;
pool->work_cnt =0;
pool->done_cnt =0;
pool->stop = 0;
pool->cleanup_func = cleanup_func;
pool->threads = malloc(sizeof(pthread_t) * thread_cnt);
pthread_mutex_init(&(pool->work_mutex), NULL);
@ -197,7 +201,7 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) {
pool->work_tail = NULL;
for (size_t i = 0; i < thread_cnt; i++) {
pthread_t thread;
pthread_t thread = pool->threads[i];
pthread_create(&thread, NULL, tpool_worker, pool);
pthread_detach(thread);
}

View File

@ -6,18 +6,37 @@
#define PBWIDTH 40
char *abspath(const char *path) {
char *abs = canonicalize_file_name(path);
abs = realloc(abs, strlen(abs) + 1);
wordexp_t w;
wordexp(path, &w, 0);
char *abs = canonicalize_file_name(w.we_wordv[0]);
if (abs == NULL) {
return NULL;
}
abs = realloc(abs, strlen(abs) + 2);
strcat(abs, "/");
wordfree(&w);
return abs;
}
/**
 * Shell-expand a path (e.g. a leading '~') and append a trailing '/'.
 *
 * The first word produced by wordexp() is used. When the expansion fails
 * (for instance because the path contains a character wordexp() rejects)
 * or yields no word, the path is used literally instead of reading an
 * unset word vector.
 *
 * @param path  path to expand; must not be NULL.
 * @return a newly allocated string the caller must free(), or NULL if the
 *         allocation fails.
 */
char *expandpath(const char *path) {
    wordexp_t w = {0};
    const int rc = wordexp(path, &w, 0);

    /* Fall back to the literal path unless an expanded word is available. */
    const char *src = path;
    if (rc == 0 && w.we_wordc > 0) {
        src = w.we_wordv[0];
    }

    const size_t len = strlen(src);
    /* +2: one byte for the trailing '/', one for the terminating NUL. */
    char *expanded = malloc(len + 2);
    if (expanded != NULL) {
        memcpy(expanded, src, len);
        expanded[len] = '/';
        expanded[len + 1] = '\0';
    }

    /* POSIX: results may have been allocated on success and on WRDE_NOSPACE. */
    if (rc == 0 || rc == WRDE_NOSPACE) {
        wordfree(&w);
    }

    return expanded;
}
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
static int last_val = 0;
int val = (int) (percentage * 100);
if (last_val == val || val >= 100) {
if (last_val == val || val > 100 || index_size < 1024) {
return;
}
last_val = val;
@ -44,7 +63,7 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
}
printf(
"\r%2d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit

View File

@ -148,6 +148,7 @@ int text_buffer_append_char(text_buffer_t *buf, int c) {
}
char *abspath(const char * path);
char *expandpath(const char *path);
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);

View File

@ -153,7 +153,7 @@ int chunked_response_file(const char *filename, const char *mime,
// %d-
ends = MIN(starts + CHUNK_SIZE, length);
}
if (ends >= length || starts >= length || starts < 0) {
if (ends > length || starts >= length || starts < 0) {
close(fd);
return OCS_INTERNAL_ERROR;
}