From 483a454c8dfaf7c5eaa9a514ad6728431250189b Mon Sep 17 00:00:00 2001
From: simon
Date: Sat, 22 Feb 2020 16:55:35 -0500
Subject: [PATCH] --exclude argument #26

---
Review notes (ignored by git-am, not part of the commit message):

* src/cli.c: pcre_compile() reports failure by returning NULL and only
  then writes the error string and offset, so the result is checked with
  `re == NULL` instead of reading the uninitialised `error` pointer.
* src/io/walk.c: EXCLUDED() now matches its own argument instead of a
  variable that happens to be called `filepath` at the call site, and
  passes the size of sub_strings in elements rather than bytes, which is
  what pcre_exec() expects for ovecsize.
* src/sist.h: the added include is spelled out as <pcre.h>.
  NOTE(review): the header names on the context lines of that hunk are
  missing in this copy of the patch (and the position of its blank
  context line is a guess); restore them from the tree before applying.

 CMakeLists.txt |  2 ++
 src/cli.c      | 21 +++++++++++++++++++++
 src/cli.h      |  6 ++++++
 src/ctx.h      |  2 ++
 src/io/walk.c  | 12 +++++++++++-
 src/main.c     |  8 +++-----
 src/sist.h     |  1 +
 7 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80b03c2..46900dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,6 +137,8 @@ TARGET_LINK_LIBRARIES(
         ${PROJECT_SOURCE_DIR}/lib/libcrypto.a
         ${PROJECT_SOURCE_DIR}/lib/libssl.a
         dl
+
+        pcre
 )
 
 add_custom_target(
diff --git a/src/cli.c b/src/cli.c
index d0663df..4a53198 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -162,6 +162,26 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
         args->tesseract_path = path;
     }
 
+    if (args->exclude_regex != NULL) {
+        const char *error;
+        int error_offset;
+
+        pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
+        if (re == NULL) { // error/error_offset are only written when compilation fails
+            LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
+        }
+
+        pcre_extra *re_extra = pcre_study(re, 0, &error);
+        if (error != NULL) {
+            LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
+        }
+
+        ScanCtx.exclude = re;
+        ScanCtx.exclude_extra = re_extra;
+    } else {
+        ScanCtx.exclude = NULL;
+    }
+
     LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
     LOG_DEBUGF("cli.c", "arg size=%d", args->size)
     LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@@ -175,6 +195,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
     LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
     LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
+    LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
 
     return 0;
 }
diff --git a/src/cli.h b/src/cli.h
index f0eb163..0bfecf3 100644
--- a/src/cli.h
+++ b/src/cli.h
@@ -18,10 +18,13 @@ typedef struct scan_args {
     archive_mode_t archive_mode;
     char *tesseract_lang;
     const char *tesseract_path;
+    char *exclude_regex;
 } scan_args_t;
 
 scan_args_t *scan_args_create();
+
 void scan_args_destroy(scan_args_t *args);
+
 int scan_args_validate(scan_args_t *args, int argc, const char **argv);
 
 typedef struct index_args {
@@ -45,12 +48,15 @@ typedef struct web_args {
 } web_args_t;
 
 index_args_t *index_args_create();
+
 void index_args_destroy(index_args_t *args);
 
 web_args_t *web_args_create();
+
 void web_args_destroy(web_args_t *args);
 
 int index_args_validate(index_args_t *args, int argc, const char **argv);
+
 int web_args_validate(web_args_t *args, int argc, const char **argv);
 
 #endif
diff --git a/src/ctx.h b/src/ctx.h
index fc08744..f0da63b 100644
--- a/src/ctx.h
+++ b/src/ctx.h
@@ -29,6 +29,8 @@ struct {
     pthread_mutex_t mupdf_mu;
     char * tesseract_lang;
     const char * tesseract_path;
+    pcre *exclude;
+    pcre_extra *exclude_extra;
 } ScanCtx;
 
 struct {
diff --git a/src/io/walk.c b/src/io/walk.c
index d1debc4..afd1cec 100644
--- a/src/io/walk.c
+++ b/src/io/walk.c
@@ -28,8 +28,18 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
     return job;
 }
 
+int sub_strings[30]; // pcre_exec() output vector; its size is passed in elements, not bytes
+#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, (str), (int) strlen(str), 0, 0, sub_strings, (int) (sizeof(sub_strings) / sizeof(sub_strings[0]))) >= 0)
+
 int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
-    if (ftw->level <= ScanCtx.depth && typeflag == FTW_F && S_ISREG(info->st_mode)) {
+
+    if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
+
+        if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
+            LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
+            return 0;
+        }
+
         parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
         tpool_add_work(ScanCtx.pool, parse, job);
     }
diff --git a/src/main.c b/src/main.c
index 7a0760a..7a63369 100644
--- a/src/main.c
+++ b/src/main.c
@@ -240,6 +240,7 @@ int main(int argc, const char *argv[]) {
                        "shallow: Don't parse archives inside archives. DEFAULT: recurse"),
             OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
                                                              "which are installed on your machine)"),
+            OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
 
             OPT_GROUP("Index options"),
             OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
@@ -286,9 +287,7 @@ int main(int argc, const char *argv[]) {
         }
 
         sist2_scan(scan_args);
-    }
-
-    else if (strcmp(argv[0], "index") == 0) {
+    } else if (strcmp(argv[0], "index") == 0) {
 
         int err = index_args_validate(index_args, argc, argv);
         if (err != 0) {
@@ -304,8 +303,7 @@ int main(int argc, const char *argv[]) {
         }
 
         sist2_web(web_args);
-    }
-    else {
+    } else {
         fprintf(stderr, "Invalid command: '%s'\n", argv[0]);
         argparse_usage(&argparse);
         return 1;
diff --git a/src/sist.h b/src/sist.h
index 79d1662..dc4209e 100644
--- a/src/sist.h
+++ b/src/sist.h
@@ -35,6 +35,7 @@
 #include
 #define BOOL int
 #include
+#include <pcre.h>
 
 #include
 #include