diff --git a/README.md b/README.md index a1a7b2f..83a4366 100644 --- a/README.md +++ b/README.md @@ -5,19 +5,40 @@ sist2 (Simple incremental search tool) +*Warning: sist2 is in early development* + ## Features +* Fast, low memory usage +* Portable (all its features are packaged in a single executable) +* Extracts text from common file types\* +* Generates thumbnails\* +* Incremental scanning + + +\* See [format support](#format-support) + +## Getting Started + +1. Have an [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) instance running +1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) + +*Windows users*: `sist2` runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) + +*Mac users*: See [#1](https://github.com/simon987/sist2/issues/1) ## Example usage +![demo](demo.gif) + See help page `sist2 --help` for more details. **Scan a directory** ```bash sist2 scan ~/Documents -o ./orig_idx/ sist2 scan --threads 4 --content-size 16384 /mnt/Pictures -sist2 scan -i ./orig_idx/ -o ./updated_idx/ ~/Documents +sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents ``` **Push index to Elasticsearch or file** @@ -46,22 +67,11 @@ docx, xlsx, pptx | | *planned* | no | *planned* | -## Installation - -1. Download runtime dependencies - - `apt install curl bzip2` - -1. Download binary - - Get [the latest release](https://github.com/simon987/sist2/releases) from GitHub - -1. (Optional) Add to search path - - `mv sist2 /usr/bin/` - ## Build from source +You can compile **sist2** by yourself if you don't want to use the pre-compiled +binaries. + 1. Install compile-time dependencies *(Debian)* @@ -70,12 +80,7 @@ docx, xlsx, pptx | | *planned* | no | *planned* | libssl-dev uuid-dev libavformat-dev libswscale-dev \ python3 libmagic-dev libfreetype6-dev libcurl-dev \ libbz2-dev yasm - ``` - *(Archlinux)* - ```bash - pacman -S git ffmpeg pkg-config cmake openssl curl \ - bzip2 yasm libutil-linux - ``` + 2. Build ```bash git clone --recurse-submodules https://github.com/simon987/sist2 diff --git a/demo.gif b/demo.gif new file mode 100644 index 0000000..d7832ab Binary files /dev/null and b/demo.gif differ diff --git a/src/ctx.h b/src/ctx.h index 1e32fa5..f3a4816 100644 --- a/src/ctx.h +++ b/src/ctx.h @@ -37,5 +37,4 @@ struct { } WebCtx; - #endif diff --git a/src/parsing/parse.c b/src/parsing/parse.c index 7bb59f0..dbf4e2b 100644 --- a/src/parsing/parse.c +++ b/src/parsing/parse.c @@ -94,7 +94,7 @@ void parse(void *arg) { if (!(SHOULD_PARSE(doc.mime))) { } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) { -// parse_media(job->filepath, &doc); + parse_media(job->filepath, &doc); } else if (IS_PDF(doc.mime)) { void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd); @@ -105,15 +105,15 @@ void parse(void *arg) { } } else if (mmime == MimeText && ScanCtx.content_size > 0) { -// parse_text(bytes_read, &fd, (char *) buf, &doc); + parse_text(bytes_read, &fd, (char *) buf, &doc); } else if (IS_FONT(doc.mime)) { -// void *font_buf = read_all(job, (char *) buf, bytes_read, &fd); -// parse_font(font_buf, doc.size, &doc); -// -// if (font_buf != buf) { -// free(font_buf); -// } + void *font_buf = read_all(job, (char *) buf, bytes_read, &fd); + parse_font(font_buf, doc.size, &doc); + + if (font_buf != buf) { + free(font_buf); + } } write_document(&doc); diff --git a/src/parsing/pdf.c b/src/parsing/pdf.c index 88ec15e..af19ae8 100644 --- a/src/parsing/pdf.c +++ b/src/parsing/pdf.c @@ -1,7 +1,7 @@ +#include #include "pdf.h" #include "src/ctx.h" -__always_inline fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { fz_page *cover = fz_load_page(ctx, fzdoc, 0); @@ -25,8 +25,12 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { fz_device *dev = fz_new_draw_device(ctx, m, pixmap); pthread_mutex_lock(&ScanCtx.mupdf_mu); - fz_run_page(ctx, cover, dev, fz_identity, NULL); - pthread_mutex_unlock(&ScanCtx.mupdf_mu); + fz_try(ctx) + fz_run_page(ctx, cover, dev, fz_identity, NULL); + fz_always(ctx) + pthread_mutex_unlock(&ScanCtx.mupdf_mu); + fz_catch(ctx) + fz_rethrow(ctx); fz_drop_device(ctx, dev); @@ -72,6 +76,7 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) { fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream); int page_count = fz_count_pages(ctx, fzdoc); + fz_page *cover = render_cover(ctx, doc, fzdoc); fz_stext_options opts; @@ -90,8 +95,12 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) { fz_device *dev = fz_new_stext_device(ctx, stext, &opts); pthread_mutex_lock(&ScanCtx.mupdf_mu); - fz_run_page_contents(ctx, page, dev, fz_identity, NULL); - pthread_mutex_unlock(&ScanCtx.mupdf_mu); + fz_try(ctx) + fz_run_page_contents(ctx, page, dev, fz_identity, NULL); + fz_always(ctx) + pthread_mutex_unlock(&ScanCtx.mupdf_mu); + fz_catch(ctx) + fz_rethrow(ctx); fz_drop_device(ctx, dev);