mirror of
https://github.com/simon987/sist2.git
synced 2025-04-22 03:36:48 +00:00
Compare commits
No commits in common. "d40f5052f986090d2f9497b0612c50c34823b10e" and "52466d5d8af8758ea5b9fa284526f456e2ca267c" have entirely different histories.
d40f5052f9
...
52466d5d8a
@ -22,6 +22,9 @@ add_subdirectory(third-party/argparse)
|
|||||||
|
|
||||||
add_executable(sist2
|
add_executable(sist2
|
||||||
|
|
||||||
|
# argparse
|
||||||
|
third-party/argparse/argparse.h third-party/argparse/argparse.c
|
||||||
|
|
||||||
src/main.c
|
src/main.c
|
||||||
src/sist.h
|
src/sist.h
|
||||||
src/io/walk.h src/io/walk.c
|
src/io/walk.h src/io/walk.c
|
||||||
@ -38,11 +41,7 @@ add_executable(sist2
|
|||||||
src/log.c src/log.h
|
src/log.c src/log.h
|
||||||
src/cli.c src/cli.h
|
src/cli.c src/cli.h
|
||||||
src/stats.c src/stats.h src/ctx.c
|
src/stats.c src/stats.h src/ctx.c
|
||||||
src/parsing/sidecar.c src/parsing/sidecar.h
|
src/parsing/sidecar.c src/parsing/sidecar.h)
|
||||||
|
|
||||||
# argparse
|
|
||||||
third-party/argparse/argparse.h third-party/argparse/argparse.c
|
|
||||||
)
|
|
||||||
|
|
||||||
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
|
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
|
||||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
|
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
|
||||||
@ -87,7 +86,6 @@ if (SIST_DEBUG)
|
|||||||
sist2
|
sist2
|
||||||
PRIVATE
|
PRIVATE
|
||||||
-fsanitize=address
|
-fsanitize=address
|
||||||
-static-libasan
|
|
||||||
)
|
)
|
||||||
set_target_properties(
|
set_target_properties(
|
||||||
sist2
|
sist2
|
||||||
|
14
src/cli.c
14
src/cli.c
@ -219,19 +219,6 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
|||||||
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
|
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (args->list_path != NULL) {
|
|
||||||
if(strcmp(args->list_path, "-") == 0) {
|
|
||||||
args->list_file = stdin;
|
|
||||||
LOG_DEBUG("cli.c", "Using stdin as list file")
|
|
||||||
} else {
|
|
||||||
args->list_file = fopen(args->list_path, "r");
|
|
||||||
|
|
||||||
if (args->list_file == NULL) {
|
|
||||||
LOG_FATALF("main.c", "List file could not be opened: %s (%s)", args->list_path, errno);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
|
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
|
||||||
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
|
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
|
||||||
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
|
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
|
||||||
@ -251,7 +238,6 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
|||||||
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
|
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
|
||||||
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
|
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
|
||||||
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
|
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
|
||||||
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -29,8 +29,6 @@ typedef struct scan_args {
|
|||||||
int read_subtitles;
|
int read_subtitles;
|
||||||
int fast_epub;
|
int fast_epub;
|
||||||
int calculate_checksums;
|
int calculate_checksums;
|
||||||
char *list_path;
|
|
||||||
FILE *list_file;
|
|
||||||
} scan_args_t;
|
} scan_args_t;
|
||||||
|
|
||||||
scan_args_t *scan_args_create();
|
scan_args_t *scan_args_create();
|
||||||
|
@ -41,7 +41,6 @@ typedef struct {
|
|||||||
|
|
||||||
GHashTable *original_table;
|
GHashTable *original_table;
|
||||||
GHashTable *copy_table;
|
GHashTable *copy_table;
|
||||||
pthread_mutex_t copy_table_mu;
|
|
||||||
|
|
||||||
pcre *exclude;
|
pcre *exclude;
|
||||||
pcre_extra *exclude_extra;
|
pcre_extra *exclude_extra;
|
||||||
|
@ -4,8 +4,6 @@
|
|||||||
|
|
||||||
#include <ftw.h>
|
#include <ftw.h>
|
||||||
|
|
||||||
/*
 * Evaluates to non-zero when string x begins with string y.
 *
 * Exactly strlen(y) bytes are compared, so the whole prefix must match and an
 * empty y matches every x. y is evaluated twice: pass expressions without
 * side effects.
 */
#define STR_STARTS_WITH(x, y) (strncmp((y), (x), strlen(y)) == 0)
|
|
||||||
|
|
||||||
__always_inline
|
__always_inline
|
||||||
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
|
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
|
||||||
int len = (int) strlen(filepath);
|
int len = (int) strlen(filepath);
|
||||||
@ -79,57 +77,3 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
|
|||||||
int walk_directory_tree(const char *dirpath) {
|
int walk_directory_tree(const char *dirpath) {
|
||||||
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
|
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
int iterate_file_list(void *input_file) {
|
|
||||||
|
|
||||||
char buf[PATH_MAX];
|
|
||||||
struct stat info;
|
|
||||||
|
|
||||||
while (fgets(buf, sizeof(buf), input_file) != NULL) {
|
|
||||||
|
|
||||||
// Remove trailing newline
|
|
||||||
*(buf + strlen(buf) - 1) = '\0';
|
|
||||||
|
|
||||||
int stat_ret = stat(buf, &info);
|
|
||||||
|
|
||||||
if (stat_ret != 0) {
|
|
||||||
LOG_ERRORF("walk.c", "Could not stat file %s (%s)", buf, strerror(errno));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!S_ISREG(info.st_mode)) {
|
|
||||||
LOG_ERRORF("walk.c", "Is not a regular file: %s", buf);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *absolute_path = canonicalize_file_name(buf);
|
|
||||||
|
|
||||||
if (absolute_path == NULL) {
|
|
||||||
LOG_FATALF("walk.c", "FIXME: Could not get absolute path of %s", buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
|
|
||||||
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
|
|
||||||
|
|
||||||
if (S_ISREG(info.st_mode)) {
|
|
||||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
|
||||||
ScanCtx.dbg_excluded_files_count += 1;
|
|
||||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
|
||||||
}
|
|
||||||
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!STR_STARTS_WITH(absolute_path, ScanCtx.index.desc.root)) {
|
|
||||||
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
int base = (int) (strrchr(buf, '/') - buf) + 1;
|
|
||||||
|
|
||||||
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
|
|
||||||
free(absolute_path);
|
|
||||||
tpool_add_work(ScanCtx.pool, parse, job);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -5,6 +5,4 @@
|
|||||||
|
|
||||||
int walk_directory_tree(const char *);
|
int walk_directory_tree(const char *);
|
||||||
|
|
||||||
int iterate_file_list(void* input_file);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
25
src/main.c
25
src/main.c
@ -14,9 +14,6 @@
|
|||||||
#include "parsing/mime.h"
|
#include "parsing/mime.h"
|
||||||
#include "parsing/parse.h"
|
#include "parsing/parse.h"
|
||||||
|
|
||||||
#include <signal.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include "stats.h"
|
#include "stats.h"
|
||||||
|
|
||||||
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
|
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
|
||||||
@ -32,6 +29,8 @@ static const char *const usage[] = {
|
|||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#include<signal.h>
|
||||||
|
#include<unistd.h>
|
||||||
|
|
||||||
static __sighandler_t sigsegv_handler = NULL;
|
static __sighandler_t sigsegv_handler = NULL;
|
||||||
static __sighandler_t sigabrt_handler = NULL;
|
static __sighandler_t sigabrt_handler = NULL;
|
||||||
@ -170,7 +169,6 @@ void initialize_scan_context(scan_args_t *args) {
|
|||||||
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
|
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
|
||||||
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
|
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
|
||||||
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
|
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
|
||||||
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
|
|
||||||
|
|
||||||
ScanCtx.calculate_checksums = args->calculate_checksums;
|
ScanCtx.calculate_checksums = args->calculate_checksums;
|
||||||
|
|
||||||
@ -336,20 +334,10 @@ void sist2_scan(scan_args_t *args) {
|
|||||||
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
|
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
|
||||||
tpool_start(ScanCtx.writer_pool);
|
tpool_start(ScanCtx.writer_pool);
|
||||||
|
|
||||||
if (args->list_path) {
|
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
|
||||||
// Scan using file list
|
if (walk_ret == -1) {
|
||||||
int list_ret = iterate_file_list(args->list_file);
|
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
|
||||||
if (list_ret != 0) {
|
|
||||||
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Scan directory recursively
|
|
||||||
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
|
|
||||||
if (walk_ret == -1) {
|
|
||||||
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tpool_wait(ScanCtx.pool);
|
tpool_wait(ScanCtx.pool);
|
||||||
tpool_destroy(ScanCtx.pool);
|
tpool_destroy(ScanCtx.pool);
|
||||||
|
|
||||||
@ -589,9 +577,6 @@ int main(int argc, const char *argv[]) {
|
|||||||
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
|
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
|
||||||
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
|
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
|
||||||
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
|
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
|
||||||
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
|
|
||||||
" instead of normal directory traversal. Use '-' to read"
|
|
||||||
" from stdin."),
|
|
||||||
|
|
||||||
OPT_GROUP("Index options"),
|
OPT_GROUP("Index options"),
|
||||||
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
|
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
|
||||||
|
@ -79,9 +79,7 @@ void parse(void *arg) {
|
|||||||
|
|
||||||
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
|
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
|
||||||
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
|
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
|
||||||
pthread_mutex_lock(&ScanCtx.copy_table_mu);
|
|
||||||
incremental_mark_file_for_copy(ScanCtx.copy_table, doc->path_md5);
|
incremental_mark_file_for_copy(ScanCtx.copy_table, doc->path_md5);
|
||||||
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
|
|
||||||
|
|
||||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||||
ScanCtx.dbg_skipped_files_count += 1;
|
ScanCtx.dbg_skipped_files_count += 1;
|
||||||
|
@ -133,9 +133,6 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Not thread safe!
|
|
||||||
*/
|
|
||||||
__always_inline
|
__always_inline
|
||||||
static int incremental_mark_file_for_copy(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
|
static int incremental_mark_file_for_copy(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
|
||||||
char *ptr = malloc(MD5_STR_LENGTH);
|
char *ptr = malloc(MD5_STR_LENGTH);
|
||||||
|
4
src/web/static_generated.c
vendored
4
src/web/static_generated.c
vendored
File diff suppressed because one or more lines are too long
20
third-party/libscan/libscan/ooxml/ooxml.c
vendored
20
third-party/libscan/libscan/ooxml/ooxml.c
vendored
@ -15,18 +15,18 @@ static int should_read_part(const char *part) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ( // Word
|
if ( // Word
|
||||||
STR_STARTS_WITH_CONSTANT(part, "word/document.xml")
|
STR_STARTS_WITH(part, "word/document.xml")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "word/footnotes.xml")
|
|| STR_STARTS_WITH(part, "word/footnotes.xml")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "word/endnotes.xml")
|
|| STR_STARTS_WITH(part, "word/endnotes.xml")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "word/footer")
|
|| STR_STARTS_WITH(part, "word/footer")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "word/header")
|
|| STR_STARTS_WITH(part, "word/header")
|
||||||
// PowerPoint
|
// PowerPoint
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "ppt/slides/slide")
|
|| STR_STARTS_WITH(part, "ppt/slides/slide")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "ppt/notesSlides/slide")
|
|| STR_STARTS_WITH(part, "ppt/notesSlides/slide")
|
||||||
// Excel
|
// Excel
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "xl/worksheets/sheet")
|
|| STR_STARTS_WITH(part, "xl/worksheets/sheet")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "xl/sharedStrings.xml")
|
|| STR_STARTS_WITH(part, "xl/sharedStrings.xml")
|
||||||
|| STR_STARTS_WITH_CONSTANT(part, "xl/workbook.xml")
|
|| STR_STARTS_WITH(part, "xl/workbook.xml")
|
||||||
) {
|
) {
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
2
third-party/libscan/libscan/util.h
vendored
2
third-party/libscan/libscan/util.h
vendored
@ -7,7 +7,7 @@
|
|||||||
#include "../third-party/utf8.h/utf8.h"
|
#include "../third-party/utf8.h/utf8.h"
|
||||||
#include "macros.h"
|
#include "macros.h"
|
||||||
|
|
||||||
#define STR_STARTS_WITH_CONSTANT(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
||||||
|
|
||||||
#define TEXT_BUF_FULL (-1)
|
#define TEXT_BUF_FULL (-1)
|
||||||
#define INITIAL_BUF_SIZE (1024 * 16)
|
#define INITIAL_BUF_SIZE (1024 * 16)
|
||||||
|
4
third-party/libscan/test/main.cpp
vendored
4
third-party/libscan/test/main.cpp
vendored
@ -227,7 +227,7 @@ TEST(Ebook, Utf8Pdf) {
|
|||||||
|
|
||||||
parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc);
|
parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc);
|
||||||
|
|
||||||
ASSERT_TRUE(STR_STARTS_WITH_CONSTANT(get_meta(&doc, MetaContent)->str_val, "最後測試 "));
|
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "最後測試 "));
|
||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -245,7 +245,7 @@ TEST(Ebook, Utf8PdfInvalidChars) {
|
|||||||
// It should say "HART is a group of highly qualified ..." but the PDF
|
// It should say "HART is a group of highly qualified ..." but the PDF
|
||||||
// text is been intentionally fucked with by the authors
|
// text is been intentionally fucked with by the authors
|
||||||
// We can at least filter out the non-printable/invalid characters like '<27>' etc
|
// We can at least filter out the non-printable/invalid characters like '<27>' etc
|
||||||
ASSERT_TRUE(STR_STARTS_WITH_CONSTANT(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
|
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
|
||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user