mirror of
https://github.com/simon987/sist2.git
synced 2025-04-24 12:45:56 +00:00
Update readme
This commit is contained in:
parent
bfc7f4ddb6
commit
9e51d55ca1
47
README.md
47
README.md
@ -5,19 +5,40 @@
|
|||||||
|
|
||||||
sist2 (Simple incremental search tool)
|
sist2 (Simple incremental search tool)
|
||||||
|
|
||||||
|
*Warning: sist2 is in early development*
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
* Fast, low memory usage
|
||||||
|
* Portable (all its features are packaged in a single executable)
|
||||||
|
* Extracts text from common file types\*
|
||||||
|
* Generates thumbnails\*
|
||||||
|
* Incremental scanning
|
||||||
|
|
||||||
|
|
||||||
|
\* See [format support](#format-support)
|
||||||
|
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
1. Have an [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) instance running
|
||||||
|
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases)
|
||||||
|
|
||||||
|
*Windows users*: `sist2` runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
|
||||||
|
|
||||||
|
*Mac users*: See [#1](https://github.com/simon987/sist2/issues/1)
|
||||||
|
|
||||||
|
|
||||||
## Example usage
|
## Example usage
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
See help page `sist2 --help` for more details.
|
See help page `sist2 --help` for more details.
|
||||||
|
|
||||||
**Scan a directory**
|
**Scan a directory**
|
||||||
```bash
|
```bash
|
||||||
sist2 scan ~/Documents -o ./orig_idx/
|
sist2 scan ~/Documents -o ./orig_idx/
|
||||||
sist2 scan --threads 4 --content-size 16384 /mnt/Pictures
|
sist2 scan --threads 4 --content-size 16384 /mnt/Pictures
|
||||||
sist2 scan -i ./orig_idx/ -o ./updated_idx/ ~/Documents
|
sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
|
||||||
```
|
```
|
||||||
|
|
||||||
**Push index to Elasticsearch or file**
|
**Push index to Elasticsearch or file**
|
||||||
@ -46,22 +67,11 @@ docx, xlsx, pptx | | *planned* | no | *planned* |
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Installation
|
|
||||||
|
|
||||||
1. Download runtime dependencies
|
|
||||||
|
|
||||||
`apt install curl bzip2`
|
|
||||||
|
|
||||||
1. Download binary
|
|
||||||
|
|
||||||
Get [the latest release](https://github.com/simon987/sist2/releases) from GitHub
|
|
||||||
|
|
||||||
1. (Optional) Add to search path
|
|
||||||
|
|
||||||
`mv sist2 /usr/bin/`
|
|
||||||
|
|
||||||
## Build from source
|
## Build from source
|
||||||
|
|
||||||
|
You can compile **sist2** yourself if you don't want to use the pre-compiled
|
||||||
|
binaries.
|
||||||
|
|
||||||
1. Install compile-time dependencies
|
1. Install compile-time dependencies
|
||||||
|
|
||||||
*(Debian)*
|
*(Debian)*
|
||||||
@ -70,12 +80,7 @@ docx, xlsx, pptx | | *planned* | no | *planned* |
|
|||||||
libssl-dev uuid-dev libavformat-dev libswscale-dev \
|
libssl-dev uuid-dev libavformat-dev libswscale-dev \
|
||||||
python3 libmagic-dev libfreetype6-dev libcurl-dev \
|
python3 libmagic-dev libfreetype6-dev libcurl-dev \
|
||||||
libbz2-dev yasm
|
libbz2-dev yasm
|
||||||
```
|
|
||||||
*(Archlinux)*
|
|
||||||
```bash
|
|
||||||
pacman -S git ffmpeg pkg-config cmake openssl curl \
|
|
||||||
bzip2 yasm libutil-linux
|
|
||||||
```
|
|
||||||
2. Build
|
2. Build
|
||||||
```bash
|
```bash
|
||||||
git clone --recurse-submodules https://github.com/simon987/sist2
|
git clone --recurse-submodules https://github.com/simon987/sist2
|
||||||
|
@ -94,7 +94,7 @@ void parse(void *arg) {
|
|||||||
if (!(SHOULD_PARSE(doc.mime))) {
|
if (!(SHOULD_PARSE(doc.mime))) {
|
||||||
|
|
||||||
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
|
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) {
|
||||||
// parse_media(job->filepath, &doc);
|
parse_media(job->filepath, &doc);
|
||||||
|
|
||||||
} else if (IS_PDF(doc.mime)) {
|
} else if (IS_PDF(doc.mime)) {
|
||||||
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||||
@ -105,15 +105,15 @@ void parse(void *arg) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
|
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
|
||||||
// parse_text(bytes_read, &fd, (char *) buf, &doc);
|
parse_text(bytes_read, &fd, (char *) buf, &doc);
|
||||||
|
|
||||||
} else if (IS_FONT(doc.mime)) {
|
} else if (IS_FONT(doc.mime)) {
|
||||||
// void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
|
||||||
// parse_font(font_buf, doc.size, &doc);
|
parse_font(font_buf, doc.size, &doc);
|
||||||
//
|
|
||||||
// if (font_buf != buf) {
|
if (font_buf != buf) {
|
||||||
// free(font_buf);
|
free(font_buf);
|
||||||
// }
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
write_document(&doc);
|
write_document(&doc);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
|
#include <src/ctx.h>
|
||||||
#include "pdf.h"
|
#include "pdf.h"
|
||||||
#include "src/ctx.h"
|
#include "src/ctx.h"
|
||||||
|
|
||||||
__always_inline
|
|
||||||
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
|
fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
|
||||||
|
|
||||||
fz_page *cover = fz_load_page(ctx, fzdoc, 0);
|
fz_page *cover = fz_load_page(ctx, fzdoc, 0);
|
||||||
@ -25,8 +25,12 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
|
|||||||
fz_device *dev = fz_new_draw_device(ctx, m, pixmap);
|
fz_device *dev = fz_new_draw_device(ctx, m, pixmap);
|
||||||
|
|
||||||
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
||||||
|
fz_try(ctx)
|
||||||
fz_run_page(ctx, cover, dev, fz_identity, NULL);
|
fz_run_page(ctx, cover, dev, fz_identity, NULL);
|
||||||
|
fz_always(ctx)
|
||||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||||
|
fz_catch(ctx)
|
||||||
|
fz_rethrow(ctx);
|
||||||
|
|
||||||
fz_drop_device(ctx, dev);
|
fz_drop_device(ctx, dev);
|
||||||
|
|
||||||
@ -72,6 +76,7 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
|
|||||||
fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);
|
fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream);
|
||||||
|
|
||||||
int page_count = fz_count_pages(ctx, fzdoc);
|
int page_count = fz_count_pages(ctx, fzdoc);
|
||||||
|
|
||||||
fz_page *cover = render_cover(ctx, doc, fzdoc);
|
fz_page *cover = render_cover(ctx, doc, fzdoc);
|
||||||
|
|
||||||
fz_stext_options opts;
|
fz_stext_options opts;
|
||||||
@ -90,8 +95,12 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) {
|
|||||||
fz_device *dev = fz_new_stext_device(ctx, stext, &opts);
|
fz_device *dev = fz_new_stext_device(ctx, stext, &opts);
|
||||||
|
|
||||||
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
pthread_mutex_lock(&ScanCtx.mupdf_mu);
|
||||||
|
fz_try(ctx)
|
||||||
fz_run_page_contents(ctx, page, dev, fz_identity, NULL);
|
fz_run_page_contents(ctx, page, dev, fz_identity, NULL);
|
||||||
|
fz_always(ctx)
|
||||||
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
pthread_mutex_unlock(&ScanCtx.mupdf_mu);
|
||||||
|
fz_catch(ctx)
|
||||||
|
fz_rethrow(ctx);
|
||||||
|
|
||||||
fz_drop_device(ctx, dev);
|
fz_drop_device(ctx, dev);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user