mirror of
https://github.com/simon987/sist2.git
synced 2025-04-20 02:36:43 +00:00
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
9e51d55ca1 |
47
README.md
47
README.md
@ -5,19 +5,40 @@
|
||||
|
||||
sist2 (Simple incremental search tool)
|
||||
|
||||
*Warning: sist2 is in early development*
|
||||
|
||||
## Features
|
||||
|
||||
* Fast, low memory usage
|
||||
* Portable (all its features are packaged in a single executable)
|
||||
* Extracts text from common file types\*
|
||||
* Generates thumbnails\*
|
||||
* Incremental scanning
|
||||
|
||||
|
||||
\* See [format support](#format-support)
|
||||
|
||||
## Getting Started
|
||||
|
||||
1. Have an [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) instance running
|
||||
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases)
|
||||
|
||||
*Windows users*: `sist2` runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
||||
|
||||
*Mac users*: See [#1](https://github.com/simon987/sist2/issues/1)
|
||||
|
||||
|
||||
## Example usage
|
||||
|
||||

|
||||
|
||||
See the help page (`sist2 --help`) for more details.
|
||||
|
||||
**Scan a directory**
|
||||
```bash
|
||||
sist2 scan ~/Documents -o ./orig_idx/
|
||||
sist2 scan --threads 4 --content-size 16384 /mnt/Pictures
|
||||
sist2 scan -i ./orig_idx/ -o ./updated_idx/ ~/Documents
|
||||
sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
|
||||
```
|
||||
|
||||
**Push index to Elasticsearch or file**
|
||||
@ -46,22 +67,11 @@ docx, xlsx, pptx | | *planned* | no | *planned* |
|
||||
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
1. Install runtime dependencies
|
||||
|
||||
`apt install curl bzip2`
|
||||
|
||||
1. Download binary
|
||||
|
||||
Get [the latest release](https://github.com/simon987/sist2/releases) from GitHub
|
||||
|
||||
1. (Optional) Add to search path
|
||||
|
||||
`mv sist2 /usr/bin/`
|
||||
|
||||
## Build from source
|
||||
|
||||
You can compile **sist2** yourself if you don't want to use the pre-compiled
|
||||
binaries.
|
||||
|
||||
1. Install compile-time dependencies
|
||||
|
||||
*(Debian)*
|
||||
@ -70,12 +80,7 @@ docx, xlsx, pptx | | *planned* | no | *planned* |
|
||||
libssl-dev uuid-dev libavformat-dev libswscale-dev \
|
||||
python3 libmagic-dev libfreetype6-dev libcurl-dev \
|
||||
libbz2-dev yasm
|
||||
```
|
||||
*(Archlinux)*
|
||||
```bash
|
||||
pacman -S git ffmpeg pkg-config cmake openssl curl \
|
||||
bzip2 yasm libutil-linux
|
||||
```
|
||||
|
||||
2. Build
|
||||
```bash
|
||||
git clone --recurse-submodules https://github.com/simon987/sist2
|
||||
|
@ -94,7 +94,7 @@ void parse(void *arg) {
|
||||
if (!(SHOULD_PARSE(doc.mime))) {
|
||||
|
||||
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
|
||||
// parse_media(job->filepath, &doc);
|
||||
parse_media(job->filepath, &doc);
|
||||
|
||||
} else if (IS_PDF(doc.mime)) {
|
||||
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||
@ -105,15 +105,15 @@ void parse(void *arg) {
|
||||
}
|
||||
|
||||
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
|
||||
// parse_text(bytes_read, &fd, (char *) buf, &doc);
|
||||
parse_text(bytes_read, &fd, (char *) buf, &doc);
|
||||
|
||||
} else if (IS_FONT(doc.mime)) {
|
||||
// void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||
// parse_font(font_buf, doc.size, &doc);
|
||||
//
|
||||
// if (font_buf != buf) {
|
||||
// free(font_buf);
|
||||
// }
|
||||
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||
parse_font(font_buf, doc.size, &doc);
|
||||
|
||||
if (font_buf != buf) {
|
||||
free(font_buf);
|
||||
}
|
||||
}
|
||||
|
||||
write_document(&doc);
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include <src/ctx.h>
|
||||
#include "pdf.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
__always_inline
|
||||
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
|
||||
|
||||
fz_page *cover = fz_load_page(ctx, fzdoc, 0);
|
||||
@ -25,8 +25,12 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
|
||||
fz_device *dev = fz_new_draw_device(ctx, m, pixmap);
|
||||
|
||||
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
||||
fz_run_page(ctx, cover, dev, fz_identity, NULL);
|
||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||
fz_try(ctx)
|
||||
fz_run_page(ctx, cover, dev, fz_identity, NULL);
|
||||
fz_always(ctx)
|
||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||
fz_catch(ctx)
|
||||
fz_rethrow(ctx);
|
||||
|
||||
fz_drop_device(ctx, dev);
|
||||
|
||||
@ -72,6 +76,7 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
|
||||
fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);
|
||||
|
||||
int page_count = fz_count_pages(ctx, fzdoc);
|
||||
|
||||
fz_page *cover = render_cover(ctx, doc, fzdoc);
|
||||
|
||||
fz_stext_options opts;
|
||||
@ -90,8 +95,12 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
|
||||
fz_device *dev = fz_new_stext_device(ctx, stext, &opts);
|
||||
|
||||
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
||||
fz_run_page_contents(ctx, page, dev, fz_identity, NULL);
|
||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||
fz_try(ctx)
|
||||
fz_run_page_contents(ctx, page, dev, fz_identity, NULL);
|
||||
fz_always(ctx)
|
||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||
fz_catch(ctx)
|
||||
fz_rethrow(ctx);
|
||||
|
||||
fz_drop_device(ctx, dev);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user