Files
sist2/src/cli.c
2020-02-01 20:03:59 -05:00

306 lines
8.5 KiB
C

#include "cli.h"
#include "ctx.h"
#include <tesseract/capi.h>
#define DEFAULT_OUTPUT "index.sist2/"
#define DEFAULT_CONTENT_SIZE 32768
#define DEFAULT_QUALITY 5
#define DEFAULT_SIZE 500
#define DEFAULT_REWRITE_URL ""
#define DEFAULT_ES_URL "http://localhost:9200"
#define DEFAULT_BATCH_SIZE 100
#define DEFAULT_BIND_ADDR "localhost"
#define DEFAULT_PORT "4090"
const char* TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"./",
NULL
};
scan_args_t *scan_args_create() {
scan_args_t *args = calloc(sizeof(scan_args_t), 1);
args->depth = -1;
return args;
}
void scan_args_destroy(scan_args_t *args) {
if (args->name != NULL) {
free(args->name);
}
if (args->path != NULL) {
free(args->path);
}
if (args->output != NULL) {
free(args->output);
}
free(args);
}
void index_args_destroy(index_args_t *args) {
//todo
free(args);
}
void web_args_destroy(web_args_t *args) {
//todo
free(args);
}
int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
return 1;
}
char *abs_path = abspath(argv[1]);
if (abs_path == NULL) {
fprintf(stderr, "File not found: %s\n", argv[1]);
return 1;
} else {
args->path = abs_path;
}
if (args->incremental != NULL) {
abs_path = abspath(args->incremental);
if (abs_path == NULL) {
fprintf(stderr, "File not found: %s\n", args->incremental);
return 1;
}
}
if (args->quality == 0) {
args->quality = DEFAULT_QUALITY;
} else if (args->quality < 1 || args->quality > 31) {
fprintf(stderr, "Invalid quality: %f\n", args->quality);
return 1;
}
if (args->size == 0) {
args->size = DEFAULT_SIZE;
} else if (args->size > 0 && args->size < 32) {
printf("Invalid size: %d\n", args->content_size);
return 1;
}
if (args->content_size == 0) {
args->content_size = DEFAULT_CONTENT_SIZE;
}
if (args->threads == 0) {
args->threads = 1;
} else if (args->threads < 0) {
fprintf(stderr, "Invalid threads: %d\n", args->threads);
return 1;
}
if (args->output == NULL) {
args->output = malloc(strlen(DEFAULT_OUTPUT) + 1);
strcpy(args->output, DEFAULT_OUTPUT);
} else {
args->output = expandpath(args->output);
}
int ret = mkdir(args->output, S_IRUSR | S_IWUSR | S_IXUSR);
if (ret != 0) {
fprintf(stderr, "Invalid output: '%s' (%s).\n", args->output, strerror(errno));
return 1;
}
if (args->depth < 0) {
args->depth = G_MAXINT32;
} else {
args->depth += 1;
}
if (args->name == NULL) {
args->name = g_path_get_basename(args->output);
}
if (args->rewrite_url == NULL) {
args->rewrite_url = DEFAULT_REWRITE_URL;
}
if (args->archive == NULL || strcmp(args->archive, "recurse") == 0) {
args->archive_mode = ARC_MODE_RECURSE;
} else if (strcmp(args->archive, "list") == 0) {
args->archive_mode = ARC_MODE_LIST;
} else if (strcmp(args->archive, "shallow") == 0) {
args->archive_mode = ARC_MODE_SHALLOW;
} else if (strcmp(args->archive, "skip") == 0) {
args->archive_mode = ARC_MODE_SKIP;
} else {
fprintf(stderr, "Archive mode must be one of (skip, list, shallow, recurse), got '%s'", args->archive);
return 1;
}
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char * path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
LOG_DEBUGF("cli.c", "arg threads=%d", args->threads)
LOG_DEBUGF("cli.c", "arg incremental=%s", args->incremental)
LOG_DEBUGF("cli.c", "arg output=%s", args->output)
LOG_DEBUGF("cli.c", "arg rewrite_url=%s", args->rewrite_url)
LOG_DEBUGF("cli.c", "arg name=%s", args->name)
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
LOG_DEBUGF("cli.c", "arg path=%s", args->path)
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
return 0;
}
int index_args_validate(index_args_t *args, int argc, const char **argv) {
LogCtx.verbose = 1;
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
return 1;
}
char *index_path = abspath(argv[1]);
if (index_path == NULL) {
fprintf(stderr, "File not found: %s\n", argv[1]);
return 1;
} else {
args->index_path = argv[1];
free(index_path);
}
if (args->es_url == NULL) {
args->es_url = DEFAULT_ES_URL;
}
if (args->script_path != NULL) {
struct stat info;
int res = stat(args->script_path, &info);
if (res == -1) {
fprintf(stderr, "Error opening script file '%s': %s\n", args->script_path, strerror(errno));
return 1;
}
int fd = open(args->script_path, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "Error opening script file '%s': %s\n", args->script_path, strerror(errno));
return 1;
}
args->script = malloc(info.st_size + 1);
res = read(fd, args->script, info.st_size);
if (res == -1) {
fprintf(stderr, "Error reading script file '%s': %s\n", args->script_path, strerror(errno));
return 1;
}
*(args->script + info.st_size) = '\0';
close(fd);
}
if (args->batch_size == 0) {
args->batch_size = DEFAULT_BATCH_SIZE;
}
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
LOG_DEBUGF("cli.c", "arg index_path=%s", args->index_path)
LOG_DEBUGF("cli.c", "arg script_path=%s", args->script_path)
LOG_DEBUGF("cli.c", "arg script=%s", args->script)
LOG_DEBUGF("cli.c", "arg print=%d", args->print)
LOG_DEBUGF("cli.c", "arg batch_size=%d", args->batch_size)
LOG_DEBUGF("cli.c", "arg force_reset=%d", args->force_reset)
return 0;
}
int web_args_validate(web_args_t *args, int argc, const char **argv) {
LogCtx.verbose = 1;
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
return 1;
}
if (args->es_url == NULL) {
args->es_url = DEFAULT_ES_URL;
}
if (args->bind == NULL) {
args->bind = DEFAULT_BIND_ADDR;
}
if (args->port == NULL) {
args->port = DEFAULT_PORT;
}
if (args->credentials != NULL) {
args->b64credentials = onion_base64_encode(args->credentials, (int) strlen(args->credentials));
//Remove trailing newline
*(args->b64credentials + strlen(args->b64credentials) - 1) = '\0';
}
args->index_count = argc - 1;
args->indices = argv + 1;
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
if (abs_path == NULL) {
fprintf(stderr, "File not found: %s\n", args->indices[i]);
return 1;
}
}
LOG_DEBUGF("cli.c", "arg es_url=%s", args->es_url)
LOG_DEBUGF("cli.c", "arg bind=%s", args->bind)
LOG_DEBUGF("cli.c", "arg port=%s", args->port)
LOG_DEBUGF("cli.c", "arg credentials=%s", args->credentials)
LOG_DEBUGF("cli.c", "arg b64credentials=%s", args->b64credentials)
LOG_DEBUGF("cli.c", "arg index_count=%d", args->index_count)
for (int i = 0; i < args->index_count; i++) {
LOG_DEBUGF("cli.c", "arg indices[%d]=%s", i, args->indices[i])
}
return 0;
}
index_args_t *index_args_create() {
index_args_t *args = calloc(sizeof(index_args_t), 1);
return args;
}
web_args_t *web_args_create() {
web_args_t *args = calloc(sizeof(web_args_t), 1);
return args;
}