diff --git a/README.md b/README.md index 71bd1e7..db2f9ce 100644 --- a/README.md +++ b/README.md @@ -113,18 +113,16 @@ sist2 scan --ocr eng ~/Books/Textbooks/ ## Build from source You can compile **sist2** by yourself if you don't want to use the pre-compiled -binaries. +binaries (GCC 7+ required). 1. Install compile-time dependencies ```bash - vcpkg install lmdb cjson glib + vcpkg install lmdb cjson glib libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 ffmpeg zstd ``` 2. Build ```bash - git clone --recurse-submodules https://github.com/simon987/sist2 - ./scripts/get_static_libs.sh - cmake . + cmake -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake . make ``` diff --git a/ci/build.sh b/ci/build.sh index 194b508..4b5f4dc 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,12 +1,11 @@ #!/usr/bin/env bash -./scripts/get_static_libs.sh rm -rf CMakeFiles CmakeCache.txt -cmake -DSIST_DEBUG=off . +cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake . make strip sist2 rm -rf CMakeFiles CmakeCache.txt -cmake -DSIST_DEBUG=on . +cmake -DSIST_DEBUG=on -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake . 
make diff --git a/src/cli.h b/src/cli.h index 81e7511..f0e5b29 100644 --- a/src/cli.h +++ b/src/cli.h @@ -3,7 +3,7 @@ #include "sist.h" -#include "libscan/scan.h" +#include "libscan/arc/arc.h" typedef struct scan_args { float quality; diff --git a/src/ctx.h b/src/ctx.h index b12e150..9cab8f7 100644 --- a/src/ctx.h +++ b/src/ctx.h @@ -4,6 +4,13 @@ #include "sist.h" #include "tpool.h" #include "libscan/scan.h" +#include "libscan/arc/arc.h" +#include "libscan/cbr/cbr.h" +#include "libscan/ebook/ebook.h" +#include "libscan/font/font.h" +#include "libscan/media/media.h" +#include "libscan/ooxml/ooxml.h" +#include "libscan/text/text.h" #include #include @@ -17,12 +24,8 @@ struct { tpool_t *pool; - int tn_size; int threads; - int content_size; - float tn_qscale; int depth; - archive_mode_t archive_mode; int verbose; int very_verbose; @@ -32,28 +35,30 @@ struct { GHashTable *original_table; GHashTable *copy_table; - pthread_mutex_t mupdf_mu; - char * tesseract_lang; - const char * tesseract_path; pcre *exclude; pcre_extra *exclude_extra; int fast; + + scan_arc_ctx_t arc_ctx; + scan_cbr_ctx_t cbr_ctx; + scan_ebook_ctx_t ebook_ctx; + scan_font_ctx_t font_ctx; + scan_media_ctx_t media_ctx; + scan_ooxml_ctx_t ooxml_ctx; + scan_text_ctx_t text_ctx; } ScanCtx; -//TODO Move to log.h struct { int verbose; int very_verbose; int no_color; } LogCtx; -//TODO Move to index.h ? struct { char *es_url; int batch_size; } IndexCtx; -//TODO Move to serve.h ? 
struct { char *es_url; int index_count; diff --git a/src/io/store.h b/src/io/store.h index f749c0d..f2db3a9 100644 --- a/src/io/store.h +++ b/src/io/store.h @@ -11,8 +11,6 @@ typedef struct store_t { pthread_rwlock_t lock; } store_t; -#include "../sist.h" - store_t *store_create(char *path); void store_destroy(store_t *store); diff --git a/src/io/walk.h b/src/io/walk.h index aeb332c..6afc258 100644 --- a/src/io/walk.h +++ b/src/io/walk.h @@ -3,8 +3,6 @@ #define _XOPEN_SOURCE 500 -#include "../sist.h" - int walk_directory_tree(const char *); #endif diff --git a/src/main.c b/src/main.c index 8336d34..e7951f2 100644 --- a/src/main.c +++ b/src/main.c @@ -2,7 +2,6 @@ #include "ctx.h" #include -#include #include #include "cli.h" @@ -10,10 +9,10 @@ #include "io/store.h" #include "tpool.h" #include "io/walk.h" -#include "io/walk.h" #include "index/elastic.h" #include "web/serve.h" #include "parsing/mime.h" +#include "parsing/parse.h" #define DESCRIPTION "Lightning-fast file system indexer and search tool." 
@@ -28,11 +27,6 @@ static const char *const usage[] = { NULL, }; -void global_init() { - //TODO -// curl_global_init(CURL_GLOBAL_NOTHING); -} - void init_dir(const char *dirpath) { char path[PATH_MAX]; snprintf(path, PATH_MAX, "%sdescriptor.json", dirpath); @@ -51,28 +45,74 @@ void scan_print_header() { LOG_INFOF("main.c", "sist2 v%s", Version) } -void sist2_scan(scan_args_t *args) { +void _store(char *key, size_t key_len, char *buf, size_t buf_len) { + store_write(ScanCtx.index.store, key, key_len, buf, buf_len); +} + +void initialize_scan_context(scan_args_t *args) { + + // Arc + ScanCtx.arc_ctx.mode = args->archive_mode; + ScanCtx.arc_ctx.log = sist_log; + ScanCtx.arc_ctx.logf = sist_logf; + ScanCtx.arc_ctx.parse = (parse_callback_t) parse; + + // Cbr + ScanCtx.cbr_ctx.log = sist_log; + ScanCtx.cbr_ctx.logf = sist_logf; + ScanCtx.cbr_ctx.store = _store; + ScanCtx.cbr_ctx.cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr"); + + // Ebook + pthread_mutex_init(&ScanCtx.ebook_ctx.mupdf_mutex, NULL); + ScanCtx.ebook_ctx.content_size = args->content_size; + ScanCtx.ebook_ctx.tn_size = args->size; + ScanCtx.ebook_ctx.tesseract_lang = args->tesseract_lang; + ScanCtx.ebook_ctx.tesseract_path = args->tesseract_path; + ScanCtx.ebook_ctx.log = sist_log; + ScanCtx.ebook_ctx.logf = sist_logf; + ScanCtx.ebook_ctx.store = _store; + + // Font + ScanCtx.font_ctx.enable_tn = args->size > 0; + ScanCtx.font_ctx.log = sist_log; + ScanCtx.font_ctx.logf = sist_logf; + ScanCtx.font_ctx.store = _store; + + // Media + ScanCtx.media_ctx.tn_qscale = args->quality; + ScanCtx.media_ctx.tn_size = args->size; + ScanCtx.media_ctx.content_size = args->content_size; + ScanCtx.media_ctx.log = sist_log; + ScanCtx.media_ctx.logf = sist_logf; + ScanCtx.media_ctx.store = _store; + + // OOXML + ScanCtx.ooxml_ctx.content_size = args->content_size; + ScanCtx.ooxml_ctx.log = sist_log; + ScanCtx.ooxml_ctx.logf = sist_logf; - ScanCtx.tn_qscale = args->quality; - ScanCtx.tn_size = 
args->size; - ScanCtx.content_size = args->content_size; ScanCtx.threads = args->threads; ScanCtx.depth = args->depth; - ScanCtx.archive_mode = args->archive_mode; + strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path)); strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name)); strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root)); strncpy(ScanCtx.index.desc.rewrite_url, args->rewrite_url, sizeof(ScanCtx.index.desc.rewrite_url)); ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root); - ScanCtx.tesseract_lang = args->tesseract_lang; - ScanCtx.tesseract_path = args->tesseract_path; ScanCtx.fast = args->fast; +} - init_dir(ScanCtx.index.path); + +void sist2_scan(scan_args_t *args) { ScanCtx.mime_table = mime_get_mime_table(); ScanCtx.ext_table = mime_get_ext_table(); + initialize_scan_context(args); + + init_dir(ScanCtx.index.path); + char store_path[PATH_MAX]; snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path); mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR); @@ -222,8 +262,6 @@ void sist2_web(web_args_t *args) { int main(int argc, const char *argv[]) { - global_init(); - scan_args_t *scan_args = scan_args_create(); index_args_t *index_args = index_args_create(); web_args_t *web_args = web_args_create(); diff --git a/src/parsing/parse.c b/src/parsing/parse.c index f3d72d4..fb9f94a 100644 --- a/src/parsing/parse.c +++ b/src/parsing/parse.c @@ -11,6 +11,9 @@ __thread magic_t Magic = NULL; +#define MIN_VIDEO_SIZE 1024 * 64 +#define MIN_IMAGE_SIZE 1024 * 2 + int fs_read(struct vfile *f, void *buf, size_t size) { if (f->fd == -1) { @@ -112,34 +115,28 @@ void parse(void *arg) { } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || (mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) { - scan_media_ctx_t media_ctx; - media_ctx.tn_qscale = ScanCtx.tn_qscale; - media_ctx.tn_size = ScanCtx.tn_size; - media_ctx.content_size = ScanCtx.content_size; - - 
parse_media(&media_ctx, &job->vfile, &doc); + parse_media(&ScanCtx.media_ctx, &job->vfile, &doc); } else if (IS_PDF(doc.mime)) { -// parse_ebook(pdf_buf, doc.size, &doc); + parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc.mime), &doc); - } else if (mmime == MimeText && ScanCtx.content_size > 0) { -// parse_text(bytes_read, &job->vfile, (char *) buf, &doc); + } else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) { + parse_text(&ScanCtx.text_ctx, &job->vfile, &doc); } else if (IS_FONT(doc.mime)) { -// parse_font(font_buf, doc.size, &doc); + parse_font(&ScanCtx.font_ctx, &job->vfile, &doc); } else if ( - ScanCtx.archive_mode != ARC_MODE_SKIP && ( + ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && ( IS_ARC(doc.mime) || (IS_ARC_FILTER(doc.mime) && should_parse_filtered_file(doc.filepath, doc.ext)) )) { -// parse_archive(&job->vfile, &doc); - } else if (ScanCtx.content_size > 0 && IS_DOC(doc.mime)) { -// parse_doc(doc_buf, doc.size, &doc); - - } else if (is_cbr(doc.mime)) { -// parse_cbr(cbr_buf, doc.size, &doc); + parse_archive(&ScanCtx.arc_ctx, &job->vfile, &doc); + } else if (ScanCtx.ooxml_ctx.content_size > 0 && IS_DOC(doc.mime)) { + parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, &doc); + } else if (is_cbr(&ScanCtx.cbr_ctx, doc.mime)) { + parse_cbr(&ScanCtx.cbr_ctx, &job->vfile, &doc); } //Parent meta diff --git a/src/util.h b/src/util.h index c2e0e09..6c45d09 100644 --- a/src/util.h +++ b/src/util.h @@ -11,7 +11,6 @@ #include "libscan/scan.h" - char *abspath(const char *path); char *expandpath(const char *path); diff --git a/src/web/serve.c b/src/web/serve.c index 4fd7f3e..5aa1160 100644 --- a/src/web/serve.c +++ b/src/web/serve.c @@ -157,108 +157,6 @@ void thumbnail(struct mg_connection *nc, struct http_message *hm, struct mg_str nc->flags |= MG_F_SEND_AND_CLOSE; } -/** - * Modified version of onion_shortcut_response_file that allows - * browsers to seek media files. 
- */ -int chunked_response_file(const char *filename, const char *mime, - int partial, onion_request *request, onion_response *res) { - int fd = open(filename, O_RDONLY | O_CLOEXEC); - struct stat st; - - if (fd < 0 || stat(filename, &st) != 0 || S_ISDIR(st.st_mode)) { - close(fd); - return OCS_NOT_PROCESSED; - } - - size_t length = st.st_size; - size_t ends; - - const char *range = onion_request_get_header(request, "Range"); - if (partial && range && strncmp(range, "bytes=", 6) == 0) { - onion_response_set_header(res, "Accept-Ranges", "bytes"); - - onion_response_set_code(res, HTTP_PARTIAL_CONTENT); - - char tmp[1024]; - if (strlen(range + 6) >= sizeof(tmp)) { - close(fd); - return OCS_INTERNAL_ERROR; - } - strncpy(tmp, range + 6, sizeof(tmp) - 1); - char *start = tmp; - char *end = tmp; - - while (*end != '-' && *end) { - end++; - } - - if (*end == '-') { - *end = '\0'; - end++; - - size_t starts; - starts = atol(start); - if (*end) { - // %d-%d - ends = atol(end); - } else { - // %d- - ends = MIN(starts + CHUNK_SIZE, length); - } - if (ends > length || starts >= length || starts < 0) { - close(fd); - return OCS_INTERNAL_ERROR; - } - length = ends - starts; - - if (starts != 0) { - lseek(fd, starts, SEEK_SET); - } - snprintf(tmp, sizeof(tmp), "bytes %ld-%ld/%ld", - starts, ends - 1, st.st_size); - onion_response_set_header(res, "Content-Range", tmp); - } - } - onion_response_set_length(res, length); - if (mime != NULL) { - onion_response_set_header(res, "Content-Type", mime); - } else { - onion_response_set_header(res, "Content-Type", "application/octet-stream"); - } - - onion_response_write_headers(res); - if ((onion_request_get_flags(request) & OR_HEAD) == OR_HEAD) { - length = 0; - } - - if (length) { - int bytes_read = 0, bytes_written; - size_t total_read = 0; - char buf[4046]; - if (length > sizeof(buf)) { - size_t max = length - sizeof(buf); - while (total_read < max) { - bytes_read = read(fd, buf, sizeof(buf)); - if (bytes_read < 0) { - break; - } - 
total_read += bytes_read; - bytes_written = onion_response_write(res, buf, bytes_read); - if (bytes_written != bytes_read) { - break; - } - } - } - if (sizeof(buf) >= (length - total_read)) { - bytes_read = read(fd, buf, length - total_read); - onion_response_write(res, buf, bytes_read); - } - } - close(fd); - return OCS_PROCESSED; -} - void search(struct mg_connection *nc, struct http_message *hm) { if (hm->body.len == 0) {