--exclude argument #26

This commit is contained in:
simon 2020-02-22 16:55:35 -05:00
parent 018ac86640
commit 483a454c8d
7 changed files with 46 additions and 6 deletions

View File

@ -137,6 +137,8 @@ TARGET_LINK_LIBRARIES(
${PROJECT_SOURCE_DIR}/lib/libcrypto.a
${PROJECT_SOURCE_DIR}/lib/libssl.a
dl
pcre
)
add_custom_target(

View File

@ -162,6 +162,26 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->tesseract_path = path;
}
if (args->exclude_regex != NULL) {
const char *error;
int error_offset;
pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
}
pcre_extra *re_extra = pcre_study(re, 0, &error);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
}
ScanCtx.exclude = re;
ScanCtx.exclude_extra = re_extra;
} else {
ScanCtx.exclude = NULL;
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@ -175,6 +195,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
return 0;
}

View File

@ -18,10 +18,13 @@ typedef struct scan_args {
archive_mode_t archive_mode;
char *tesseract_lang;
const char *tesseract_path;
char *exclude_regex;
} scan_args_t;
scan_args_t *scan_args_create();
void scan_args_destroy(scan_args_t *args);
int scan_args_validate(scan_args_t *args, int argc, const char **argv);
typedef struct index_args {
@ -45,12 +48,15 @@ typedef struct web_args {
} web_args_t;
index_args_t *index_args_create();
void index_args_destroy(index_args_t *args);
web_args_t *web_args_create();
void web_args_destroy(web_args_t *args);
int index_args_validate(index_args_t *args, int argc, const char **argv);
int web_args_validate(web_args_t *args, int argc, const char **argv);
#endif

View File

@ -29,6 +29,8 @@ struct {
pthread_mutex_t mupdf_mu;
char * tesseract_lang;
const char * tesseract_path;
pcre *exclude;
pcre_extra *exclude_extra;
} ScanCtx;
struct {

View File

@ -28,8 +28,18 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
return job;
}
int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, filepath, strlen(filepath), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (ftw->level <= ScanCtx.depth && typeflag == FTW_F && S_ISREG(info->st_mode)) {
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
return 0;
}
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}

View File

@ -240,6 +240,7 @@ int main(int argc, const char *argv[]) {
"shallow: Don't parse archives inside archives. DEFAULT: recurse"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_GROUP("Index options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
@ -286,9 +287,7 @@ int main(int argc, const char *argv[]) {
}
sist2_scan(scan_args);
}
else if (strcmp(argv[0], "index") == 0) {
} else if (strcmp(argv[0], "index") == 0) {
int err = index_args_validate(index_args, argc, argv);
if (err != 0) {
@ -304,8 +303,7 @@ int main(int argc, const char *argv[]) {
}
sist2_web(web_args);
}
else {
} else {
fprintf(stderr, "Invalid command: '%s'\n", argv[0]);
argparse_usage(&argparse);
return 1;

View File

@ -35,6 +35,7 @@
#include <libxml/xmlstring.h>
#define BOOL int
#include <tesseract/capi.h>
#include <pcre.h>
#include <onion/onion.h>
#include <onion/handler.h>