Add --list-file argument

This commit is contained in:
simon987 2021-12-29 18:54:13 -05:00
parent 52466d5d8a
commit 81008d8936
10 changed files with 113 additions and 24 deletions

View File

@ -22,9 +22,6 @@ add_subdirectory(third-party/argparse)
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
@ -41,7 +38,11 @@ add_executable(sist2
src/log.c src/log.h
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h)
src/parsing/sidecar.c src/parsing/sidecar.h
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)

View File

@ -219,6 +219,19 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
// --list-file was supplied: resolve it to an open stream.
// "-" selects stdin; anything else is opened for reading as a regular path.
if (args->list_path != NULL) {
    if (strcmp(args->list_path, "-") == 0) {
        args->list_file = stdin;
        LOG_DEBUG("cli.c", "Using stdin as list file")
    } else {
        args->list_file = fopen(args->list_path, "r");
        if (args->list_file == NULL) {
            // errno is an int; it must go through strerror() to be printed with %s
            LOG_FATALF("cli.c", "List file could not be opened: %s (%s)", args->list_path, strerror(errno));
        }
    }
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@ -238,6 +251,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
return 0;
}

View File

@ -29,6 +29,8 @@ typedef struct scan_args {
int read_subtitles;
int fast_epub;
int calculate_checksums;
char *list_path;
FILE *list_file;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@ -4,6 +4,8 @@
#include <ftw.h>
/*
 * Non-zero when the NUL-terminated string x begins with the NUL-terminated
 * prefix y. The whole prefix is compared (strlen(y) characters), so a prefix
 * ending in '/' only matches paths located under that directory.
 * Note: y is evaluated twice.
 */
#define STR_STARTS_WITH(x, y) (strncmp((y), (x), strlen(y)) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
@ -77,3 +79,57 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
/**
 * Walk the tree rooted at dirpath with nftw(), passing every entry to
 * handle_entry().
 *
 * @param dirpath Directory at which the traversal starts.
 * @return Whatever nftw() returns.
 */
int walk_directory_tree(const char *dirpath) {
    // FTW_PHYS: do not follow symlinks; FTW_ACTIONRETVAL: glibc extension
    // that lets the callback's return value steer the traversal.
    const int walk_flags = FTW_PHYS | FTW_ACTIONRETVAL;

    int result = nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, walk_flags);
    return result;
}
int iterate_file_list(void *input_file) {
char buf[PATH_MAX];
struct stat info;
while (fgets(buf, sizeof(buf), input_file) != NULL) {
// Remove trailing newline
*(buf + strlen(buf) - 1) = '\0';
int stat_ret = stat(buf, &info);
if (stat_ret != 0) {
LOG_ERRORF("walk.c", "Could not stat file %s (%s)", buf, strerror(errno));
continue;
}
if (!S_ISREG(info.st_mode)) {
LOG_ERRORF("walk.c", "Is not a regular file: %s", buf);
continue;
}
char *absolute_path = canonicalize_file_name(buf);
if (absolute_path == NULL) {
LOG_FATALF("walk.c", "FIXME: Could not get absolute path of %s", buf);
}
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
if (S_ISREG(info.st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
}
continue;
}
if (!STR_STARTS_WITH(absolute_path, ScanCtx.index.desc.root)) {
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
}
int base = (int) (strrchr(buf, '/') - buf) + 1;
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
free(absolute_path);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
}

View File

@ -5,4 +5,6 @@
int walk_directory_tree(const char *);
int iterate_file_list(void* input_file);
#endif

View File

@ -14,6 +14,9 @@
#include "parsing/mime.h"
#include "parsing/parse.h"
#include <signal.h>
#include <unistd.h>
#include "stats.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
@ -29,8 +32,6 @@ static const char *const usage[] = {
NULL,
};
#include<signal.h>
#include<unistd.h>
static __sighandler_t sigsegv_handler = NULL;
static __sighandler_t sigabrt_handler = NULL;
@ -334,10 +335,20 @@ void sist2_scan(scan_args_t *args) {
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
tpool_start(ScanCtx.writer_pool);
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
if (args->list_path) {
// Scan using file list
int list_ret = iterate_file_list(args->list_file);
if (list_ret != 0) {
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
}
} else {
// Scan directory recursively
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
}
}
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
@ -577,6 +588,9 @@ int main(int argc, const char *argv[]) {
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
" instead of normal directory traversal. Use '-' to read"
" from stdin."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),

File diff suppressed because one or more lines are too long

View File

@ -15,18 +15,18 @@ static int should_read_part(const char *part) {
}
if ( // Word
STR_STARTS_WITH(part, "word/document.xml")
|| STR_STARTS_WITH(part, "word/footnotes.xml")
|| STR_STARTS_WITH(part, "word/endnotes.xml")
|| STR_STARTS_WITH(part, "word/footer")
|| STR_STARTS_WITH(part, "word/header")
STR_STARTS_WITH_CONSTANT(part, "word/document.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/endnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footer")
|| STR_STARTS_WITH_CONSTANT(part, "word/header")
// PowerPoint
|| STR_STARTS_WITH(part, "ppt/slides/slide")
|| STR_STARTS_WITH(part, "ppt/notesSlides/slide")
|| STR_STARTS_WITH_CONSTANT(part, "ppt/slides/slide")
|| STR_STARTS_WITH_CONSTANT(part, "ppt/notesSlides/slide")
// Excel
|| STR_STARTS_WITH(part, "xl/worksheets/sheet")
|| STR_STARTS_WITH(part, "xl/sharedStrings.xml")
|| STR_STARTS_WITH(part, "xl/workbook.xml")
|| STR_STARTS_WITH_CONSTANT(part, "xl/worksheets/sheet")
|| STR_STARTS_WITH_CONSTANT(part, "xl/sharedStrings.xml")
|| STR_STARTS_WITH_CONSTANT(part, "xl/workbook.xml")
) {
return TRUE;
}

View File

@ -7,7 +7,7 @@
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
/*
 * Non-zero when string x begins with y. The prefix length is taken with
 * sizeof(y) - 1, so y must be a string literal (or char array); passing a
 * char pointer would measure the pointer, not the string.
 */
#define STR_STARTS_WITH_CONSTANT(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
#define TEXT_BUF_FULL (-1)
#define INITIAL_BUF_SIZE (1024 * 16)

View File

@ -227,7 +227,7 @@ TEST(Ebook, Utf8Pdf) {
parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc);
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "最後測試 "));
ASSERT_TRUE(STR_STARTS_WITH_CONSTANT(get_meta(&doc, MetaContent)->str_val, "最後測試 "));
cleanup(&doc, &f);
}
@ -245,7 +245,7 @@ TEST(Ebook, Utf8PdfInvalidChars) {
// It should say "HART is a group of highly qualified ..." but the PDF
// text is been intentionally fucked with by the authors
// We can at least filter out the non-printable/invalid characters like '<27>' etc
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
ASSERT_TRUE(STR_STARTS_WITH_CONSTANT(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
cleanup(&doc, &f);
}