diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ccf96f..394fcb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,9 @@ add_executable( src/database/database_schema.c src/database/database_fts.c src/web/web_fts.c - src/database/database_embeddings.c) + src/database/database_embeddings.c + src/ignorelist.c + src/ignorelist.h) set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C) target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/) @@ -76,6 +78,7 @@ find_package(CURL CONFIG REQUIRED) find_library(MAGIC_LIB NAMES libmagic.a REQUIRED) find_package(unofficial-sqlite3 CONFIG REQUIRED) find_package(OpenBLAS CONFIG REQUIRED) +find_package(libgit2 CONFIG REQUIRED) target_include_directories( @@ -149,6 +152,7 @@ target_link_libraries( # m z + libgit2::libgit2package argparse unofficial::mongoose::mongoose CURL::libcurl diff --git a/docs/USAGE.md b/docs/USAGE.md index e1a5b60..1a54f29 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -108,6 +108,27 @@ sist scan ~/Documents -o ./documents.sist2 --incremental sist scan ~/Documents -o ./documents.sist2 --incremental ``` +### Excluding files + +You can use the `--exclude` option to specify exclude patterns. For more complex setups, you can create a +`.sist2ignore` file at the root of the scan path (For example, `~/Documents/.sist2ignore` for the example above). + +The syntax for sist2ignore is the same as .gitignore for Git (reference [here](https://git-scm.com/docs/gitignore)). 
+
+Example:
+
+**.sist2ignore**
+```gitignore
+# Ignore all PDF files
+*.pdf
+
+# But don't ignore them for the /important_files/ directory
+!/important_files/*.pdf
+
+# Ignore all files in _staging/ directories
+_staging/
+```
+
 ### Index documents to Elasticsearch search backend
 
 ```bash
diff --git a/src/ctx.h b/src/ctx.h
index 809ca26..af6a6d6 100644
--- a/src/ctx.h
+++ b/src/ctx.h
@@ -19,6 +19,7 @@
 #include "src/database/database.h"
 #include "src/index/elastic.h"
 #include "sqlite3.h"
+#include "ignorelist.h"
 
 #include
 
@@ -34,6 +35,7 @@ typedef struct {
     pcre *exclude;
     pcre_extra *exclude_extra;
     int fast;
+    ignorelist_t *ignorelist;
 
     scan_arc_ctx_t arc_ctx;
     scan_comic_ctx_t comic_ctx;
diff --git a/src/ignorelist.c b/src/ignorelist.c
new file mode 100644
index 0000000..9b1b8e9
--- /dev/null
+++ b/src/ignorelist.c
@@ -0,0 +1,170 @@
+#include "ignorelist.h"
+#include "ctx.h"
+#include <git2.h>
+
+/**
+ * Set of gitignore-style rules. libgit2 evaluates ignore rules in the context
+ * of a repository, so a temporary bare repository is used as a backing store.
+ */
+struct ignorelist {
+    git_repository *repo;     // NULL until an ignore file has been loaded
+    char repo_path[PATH_MAX]; // Path of the temporary repository
+    int has_rules;            // TRUE once at least one rule has been added
+};
+
+/**
+ * Returns the directory used for temporary files: the value of $TMPDIR when it
+ * is set and non-empty, "/tmp/" otherwise. The result must not be freed.
+ */
+char *get_tempdir(void) {
+    char *tempdir_env = getenv("TMPDIR");
+
+    // An empty TMPDIR is treated as unset
+    if (tempdir_env != NULL && tempdir_env[0] != '\0') {
+        return tempdir_env;
+    }
+
+    return "/tmp/";
+}
+
+/**
+ * Releases an ignorelist returned by ignorelist_create(). NULL is accepted.
+ *
+ * NOTE: the temporary repository created by ignorelist_load_ignore_file() is
+ * not deleted from disk here.
+ */
+void ignorelist_destroy(ignorelist_t *ignorelist) {
+    if (ignorelist == NULL) {
+        return;
+    }
+
+    // libgit2 objects have to be released before the library is shut down
+    if (ignorelist->repo != NULL) {
+        git_repository_free(ignorelist->repo);
+    }
+
+    free(ignorelist);
+
+    // Balances the git_libgit2_init() call made by ignorelist_create()
+    git_libgit2_shutdown();
+}
+
+/**
+ * Initializes libgit2 and allocates an ignorelist without rules. Only the path
+ * of the temporary repository is computed here (temporary directory + process
+ * id); the repository is created by ignorelist_load_ignore_file().
+ *
+ * @return the new ignorelist, to be released with ignorelist_destroy()
+ */
+ignorelist_t *ignorelist_create(void) {
+    int init_result = git_libgit2_init();
+
+    if (init_result < 0) {
+        LOG_FATALF("ignorelist.c", "Got error code from git_libgit2_init(): %d", init_result);
+        return NULL;
+    }
+
+    ignorelist_t *ignorelist = malloc(sizeof(*ignorelist));
+
+    if (ignorelist == NULL) {
+        LOG_FATALF("ignorelist.c", "Could not allocate %zu bytes", sizeof(*ignorelist));
+        return NULL;
+    }
+
+    ignorelist->repo = NULL;
+    ignorelist->has_rules = FALSE;
+
+    const char *tempdir = get_tempdir();
+    size_t tempdir_len = strlen(tempdir);
+    const char *separator = (tempdir_len > 0 && tempdir[tempdir_len - 1] == '/') ? "" : "/";
+
+    int written = snprintf(ignorelist->repo_path, sizeof(ignorelist->repo_path),
+                           "%s%ssist2-ignorelist-%d", tempdir, separator, (int) getpid());
+
+    if (written < 0 || (size_t) written >= sizeof(ignorelist->repo_path)) {
+        LOG_FATALF("ignorelist.c", "Temporary directory path is too long: %s", tempdir);
+    }
+
+    return ignorelist;
+}
+
+/**
+ * Loads gitignore-style rules from filepath, one rule per line. A file that
+ * cannot be opened means "no ignore list": the ignorelist stays without rules.
+ */
+void ignorelist_load_ignore_file(ignorelist_t *ignorelist, const char *filepath) {
+    char line[PATH_MAX * 2];
+
+    FILE *file = fopen(filepath, "r");
+
+    if (file == NULL) {
+        // No ignore list
+        return;
+    }
+
+    LOG_DEBUGF("ignorelist.c", "Opening temporary git repository %s", ignorelist->repo_path);
+    int init_result = git_repository_init(&ignorelist->repo, ignorelist->repo_path, TRUE);
+
+    if (init_result != 0) {
+        fclose(file);
+        LOG_FATALF("ignorelist.c", "Got error code from git_repository_init(): %d", init_result);
+        return;
+    }
+
+    git_ignore_clear_internal_rules(ignorelist->repo);
+
+    while (fgets(line, sizeof(line), file) != NULL) {
+
+        // Strip the line terminator only when there is one: the last line of
+        // the file does not necessarily end with a newline
+        line[strcspn(line, "\r\n")] = '\0';
+
+        if (line[0] == '\0') {
+            continue;
+        }
+
+        int result = git_ignore_add_rule(ignorelist->repo, line);
+
+        if (result == 0) {
+            LOG_DEBUGF("ignorelist.c", "Load ignore rule: %s", line);
+            ignorelist->has_rules = TRUE;
+        } else {
+            LOG_FATALF("ignorelist.c", "Invalid ignore rule: %s", line);
+        }
+    }
+
+    fclose(file);
+}
+
+/**
+ * Tells whether filepath is excluded by the loaded rules. Rules are matched
+ * against the part of filepath that follows the scan root, i.e. the first
+ * ScanCtx.index.desc.root_len characters are skipped.
+ *
+ * @return non-zero when the path is ignored, FALSE otherwise
+ */
+int ignorelist_is_ignored(ignorelist_t *ignorelist, const char *filepath) {
+
+    if (ignorelist == NULL || !ignorelist->has_rules) {
+        return FALSE;
+    }
+
+    size_t root_len = (size_t) ScanCtx.index.desc.root_len;
+
+    // Nothing is left after the root prefix: there is no path to match
+    if (strlen(filepath) <= root_len) {
+        return FALSE;
+    }
+
+    const char *rel_path = filepath + root_len;
+
+    int ignored = FALSE;
+    int result = git_ignore_path_is_ignored(&ignored, ignorelist->repo, rel_path);
+
+    if (result != 0) {
+        LOG_FATALF("ignorelist.c", "git_ignore_path_is_ignored returned error code: %d", result);
+        return FALSE;
+    }
+
+    return ignored;
+}
diff --git a/src/ignorelist.h b/src/ignorelist.h
new file mode 100644
index 0000000..0655572
--- /dev/null
+++ b/src/ignorelist.h
@@ -0,0 +1,16 @@
+#ifndef SIST2_IGNORELIST_H
+#define SIST2_IGNORELIST_H
+
+#include "src/sist.h"
+
+typedef struct ignorelist ignorelist_t;
+
+ignorelist_t *ignorelist_create(void);
+
+void ignorelist_destroy(ignorelist_t* ignorelist);
+
+void ignorelist_load_ignore_file(ignorelist_t* ignorelist, const char* filepath);
+
+int ignorelist_is_ignored(ignorelist_t* ignorelist, const char* filepath);
+
+#endif //SIST2_IGNORELIST_H
diff --git a/src/io/walk.c b/src/io/walk.c
index dfeb2b3..be569f7 100644
--- a/src/io/walk.c
+++ b/src/io/walk.c
@@ -23,8 +23,17 @@ int handle_entry(const char *filepath, const struct stat
*info, int typeflag, st if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) { LOG_DEBUGF("walk.c", "Excluded: %s", filepath); - if (typeflag == FTW_F && S_ISREG(info->st_mode)) { - } else if (typeflag == FTW_D) { + if (typeflag == FTW_D) { + return FTW_SKIP_SUBTREE; + } + + return FTW_CONTINUE; + } + + if (ignorelist_is_ignored(ScanCtx.ignorelist, filepath)) { + LOG_DEBUGF("walk.c", "Ignored: %s", filepath); + + if (typeflag == FTW_D) { return FTW_SKIP_SUBTREE; } diff --git a/src/main.c b/src/main.c index 63abd7b..6c82b7b 100644 --- a/src/main.c +++ b/src/main.c @@ -11,6 +11,7 @@ #include "web/serve.h" #include "parsing/mime.h" #include "parsing/parse.h" +#include "ignorelist.h" #include #include @@ -239,6 +240,13 @@ void sist2_scan(scan_args_t *args) { LOG_INFOF("main.c", "sist2 v%s", Version); + ScanCtx.ignorelist = ignorelist_create(); + + char ignore_filepath[PATH_MAX]; + sprintf(ignore_filepath, "%s.sist2ignore", args->path); + + ignorelist_load_ignore_file(ScanCtx.ignorelist, ignore_filepath); + ScanCtx.pool = tpool_create(ScanCtx.threads, TRUE); tpool_start(ScanCtx.pool); @@ -268,6 +276,7 @@ void sist2_scan(scan_args_t *args) { database_generate_stats(db, args->treemap_threshold); database_close(db, args->optimize_database); + ignorelist_destroy(ScanCtx.ignorelist); } void sist2_index(index_args_t *args) { diff --git a/src/sist.h b/src/sist.h index c5b0630..b6547f5 100644 --- a/src/sist.h +++ b/src/sist.h @@ -51,11 +51,11 @@ #include #include "git_hash.h" -#define VERSION "3.4.7" +#define VERSION "3.5.0" static const char *const Version = VERSION; static const int VersionMajor = 3; -static const int VersionMinor = 4; -static const int VersionPatch = 7; +static const int VersionMinor = 5; +static const int VersionPatch = 0; #ifndef SIST_PLATFORM #define SIST_PLATFORM unknown