Compare commits

...

14 Commits

Author SHA1 Message Date
Michael M. Chang
760fc5093e
Merge d0ec304f7b5610d79be2f84cc461f796f1322012 into d800effad9dfbdd5fa00b6f012eba99095448f21 2025-02-16 10:56:34 +01:00
Shy
d800effad9
Merge pull request #511 from dpieski/patch-5
Update README.md
2025-02-06 17:58:36 -05:00
Shy
371e9c408e
Merge pull request #512 from dpieski/patch-6
Update README.md
2025-02-06 17:58:07 -05:00
Andrew
ee1b1d8bb4
Update README.md
Moved README references from simon987 to sist2app
2025-02-03 15:09:11 -06:00
Andrew
63a097a463
Update README.md
Update to the docker-compose.yml example.
2025-02-03 15:00:03 -06:00
Shy
7a03a2202e Fix #481 2025-01-24 19:40:08 -05:00
Shy
050fc500ce Fix #462 2025-01-24 19:22:01 -05:00
Shy
d44679131b Update compose file to avoid confusion. Fixes #490 2025-01-23 21:45:01 -05:00
Shy
4dd5e70406 Fix #492 2025-01-23 21:40:37 -05:00
Shy
5a82581992 Fix magic database problem 2025-01-23 21:40:27 -05:00
Shy
0dc18a56c0 Fix #509 2025-01-23 19:10:17 -05:00
Shy
258b2e31e6 Version bump 2025-01-23 19:10:02 -05:00
Shy
c726074029 Update tessdata paths 2025-01-23 19:09:54 -05:00
Michael C
d0ec304f7b
PUID and PGID have no effect on elasticsearch container 2024-05-20 11:41:30 -04:00
10 changed files with 75 additions and 45 deletions

View File

@ -1,5 +1,5 @@
![GitHub](https://img.shields.io/github/license/simon987/sist2.svg) ![GitHub](https://img.shields.io/github/license/sist2app/sist2.svg)
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2) [![CodeFactor](https://www.codefactor.io/repository/github/sist2app/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/sist2app/sist2)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/) [![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/) **Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
@ -38,26 +38,22 @@ sist2 (Simple incremental search tool)
### Using Docker Compose *(Windows/Linux/Mac)* ### Using Docker Compose *(Windows/Linux/Mac)*
```yaml ```yaml
version: "3"
services: services:
elasticsearch: elasticsearch:
image: elasticsearch:7.17.9 image: elasticsearch:7.17.9
restart: unless-stopped restart: unless-stopped
volumes: volumes:
# This directory must have 1000:1000 permissions (or update PUID & PGID below) # This directory must have 1000:1000 permissions
- /data/sist2-es-data/:/usr/share/elasticsearch/data - /data/sist2-es-data/:/usr/share/elasticsearch/data
environment: environment:
- "discovery.type=single-node" - "discovery.type=single-node"
- "ES_JAVA_OPTS=-Xms2g -Xmx2g" - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
- "PUID=1000"
- "PGID=1000"
sist2-admin: sist2-admin:
image: simon987/sist2:3.4.2-x64-linux image: sist2app/sist2:x64-linux
restart: unless-stopped restart: unless-stopped
volumes: volumes:
- /data/sist2-admin-data/:/sist2-admin/ - /data/sist2-admin-data/:/sist2-admin/
- /:/host - /<path to index>/:/host
ports: ports:
- 4090:4090 - 4090:4090
# NOTE: Don't expose this port publicly! # NOTE: Don't expose this port publicly!
@ -81,7 +77,7 @@ Navigate to http://localhost:8080/ to configure sist2-admin.
``` ```
* **SQLite**: No installation required * **SQLite**: No installation required
2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases). 2. Download the [latest sist2 release](https://github.com/sist2app/sist2/releases).
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`. Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
3. See [usage guide](docs/USAGE.md) for command line usage. 3. See [usage guide](docs/USAGE.md) for command line usage.
@ -100,20 +96,20 @@ Example usage:
| File type | Library | Content | Thumbnail | Metadata | | File type | Library | Content | Thumbnail | Metadata |
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------| |:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title | | pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - | | cbz,cbr | [libscan](https://github.com/sist2app/sist2/tree/master/third-party/libscan) | - | yes | - |
| `audio/*` | ffmpeg | - | yes | ID3 tags | | `audio/*` | ffmpeg | - | yes | ID3 tags |
| `video/*` | ffmpeg | - | yes | title, comment, artist | | `video/*` | ffmpeg | - | yes | title, comment, artist |
| `image/*` | ffmpeg | ocr | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags | | `image/*` | ffmpeg | ocr | yes | [Common EXIF tags](https://github.com/sist2app/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | no | yes | Common EXIF tags, GPS tags | | raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | no | yes | Common EXIF tags, GPS tags |
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | | ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - | | `text/plain` | [libscan](https://github.com/sist2app/sist2/tree/master/third-party/libscan) | yes | no | - |
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - | | html, xml | [libscan](https://github.com/sist2app/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no | | tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title | | docx, xlsx, pptx | [libscan](https://github.com/sist2app/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | no | author, title | | doc (MS Word 97-2003) | antiword | yes | no | author, title |
| mobi, azw, azw3 | libmobi | yes | yes | author, title | | mobi, azw, azw3 | libmobi | yes | yes | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* | | wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - | | json, jsonl, ndjson | [libscan](https://github.com/sist2app/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)* \* *See [Archive files](#archive-files)*
@ -137,7 +133,7 @@ You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files). directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages The `sist2app/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa, chi_sim, deu, pol) pre-installed. (hin, jpn, eng, fra, rus, spa, chi_sim, deu, pol) pre-installed.
You can use the `+` separator to specify multiple languages. The language You can use the `+` separator to specify multiple languages. The language
@ -177,13 +173,13 @@ sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported re
to enable it. to enable it.
The text processing is done in your browser, no data is sent to any third-party services. The text processing is done in your browser, no data is sent to any third-party services.
See [simon987/sist2-ner-models](https://github.com/simon987/sist2-ner-models) for more details. See [sist2app/sist2-ner-models](https://github.com/sist2app/sist2-ner-models) for more details.
#### List of available repositories: #### List of available repositories:
| URL | Maintainer | Purpose | | URL | Maintainer | Purpose |
|---------------------------------------------------------------------------------------------------------|-----------------------------------------|---------| |---------------------------------------------------------------------------------------------------------|-----------------------------------------|---------|
| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General | | [sist2app/sist2-ner-models](https://raw.githubusercontent.com/sist2app/sist2-ner-models/main/repo.json) | [sist2app](https://github.com/sist2app) | General |
<details> <details>
<summary>Screenshot</summary> <summary>Screenshot</summary>
@ -199,7 +195,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
### Using docker ### Using docker
```bash ```bash
git clone --recursive https://github.com/simon987/sist2/ git clone --recursive https://github.com/sist2app/sist2/
cd sist2 cd sist2
docker build . -t my-sist2-image docker build . -t my-sist2-image
# Copy sist2 executable from docker image # Copy sist2 executable from docker image
@ -214,7 +210,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git nodejs apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git nodejs
``` ```
2. Install vcpkg using my fork: https://github.com/simon987/vcpkg 2. Install vcpkg using my fork: https://github.com/sist2app/vcpkg
3. Install vcpkg dependencies 3. Install vcpkg dependencies
```bash ```bash
@ -223,7 +219,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
4. Build 4. Build
```bash ```bash
git clone --recursive https://github.com/simon987/sist2/ git clone --recursive https://github.com/sist2app/sist2/
(cd sist2-vue; npm install; npm run build) (cd sist2-vue; npm install; npm run build)
(cd sist2-admin/frontend; npm install; npm run build) (cd sist2-admin/frontend; npm install; npm run build)
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake . cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .

View File

@ -5,20 +5,18 @@ services:
image: elasticsearch:7.17.9 image: elasticsearch:7.17.9
container_name: sist2-es container_name: sist2-es
volumes: volumes:
# This directory must have 1000:1000 permissions (or update PUID & PGID below) # This directory must have 1000:1000 permissions
- /data/sist2-es-data/:/usr/share/elasticsearch/data - /data/sist2-es-data/:/usr/share/elasticsearch/data
environment: environment:
- "discovery.type=single-node" - "discovery.type=single-node"
- "ES_JAVA_OPTS=-Xms2g -Xmx2g" - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
- "PUID=1000"
- "PGID=1000"
sist2-admin: sist2-admin:
build: build:
context: . context: .
container_name: sist2-admin container_name: sist2-admin
volumes: volumes:
- /data/sist2-admin-data/:/sist2-admin/ - /data/sist2-admin-data/:/sist2-admin/
- /:/host - /<path to index>/:/host
ports: ports:
- 4090:4090 - 4090:4090
# NOTE: Don't expose this port publicly! # NOTE: Don't expose this port publicly!

View File

@ -1,5 +1,16 @@
with open("/usr/lib/file/magic.mgc", "rb") as f: MAGIC_PATHS = [
data = f.read() "/vcpkg/installed/x64-linux/share/libmagic/misc/magic.mgc",
"/work/vcpkg/installed/x64-linux/share/libmagic/misc/magic.mgc",
"/usr/lib/file/magic.mgc"
]
# Read the first magic database that can be opened, trying MAGIC_PATHS in order.
for path in MAGIC_PATHS:
    try:
        with open(path, "rb") as f:
            data = f.read()
        break
    except OSError:
        # Candidate is missing or unreadable; try the next one.
        # (A bare `except:` would also swallow KeyboardInterrupt/SystemExit.)
        continue
else:
    # No candidate could be read: fail here with a clear message instead of
    # hitting a NameError on `data` further down.
    raise FileNotFoundError(
        "magic.mgc not found in any of: %s" % ", ".join(MAGIC_PATHS)
    )
print("char magic_database_buffer[%d] = {%s};" % (len(data), ",".join(str(int(b)) for b in data))) print("char magic_database_buffer[%d] = {%s};" % (len(data), ",".join(str(int(b)) for b in data)))

View File

@ -309,7 +309,7 @@ class Sist2Api {
} }
getTagsSqlite() { getTagsSqlite() {
return axios.get(`${this.baseUrl}/fts/tags`) return axios.get(`${this.baseUrl}fts/tags`)
.then(resp => { .then(resp => {
return resp.data.map(tag => this._createEsTag(tag.tag, tag.count)) return resp.data.map(tag => this._createEsTag(tag.tag, tag.count))
}); });
@ -566,7 +566,7 @@ class Sist2Api {
} }
getDocumentSqlite(sid) { getDocumentSqlite(sid) {
return axios.get(`${this.baseUrl}/fts/d/${sid}`) return axios.get(`${this.baseUrl}fts/d/${sid}`)
.then(resp => ({ .then(resp => ({
_source: resp.data _source: resp.data
})); }));
@ -589,7 +589,7 @@ class Sist2Api {
} }
getTagSuggestionsSqlite(prefix) { getTagSuggestionsSqlite(prefix) {
return axios.post(`${this.baseUrl}/fts/suggestTags`, prefix) return axios.post(`${this.baseUrl}fts/suggestTags`, prefix)
.then(resp => (resp.data)); .then(resp => (resp.data));
} }
@ -620,7 +620,7 @@ class Sist2Api {
} }
getEmbeddings(sid, modelId) { getEmbeddings(sid, modelId) {
return axios.post(`${this.baseUrl}/e/${sid}/${modelId.toString().padStart(3, '0')}`) return axios.post(`${this.baseUrl}e/${sid}/${modelId.toString().padStart(3, '0')}`)
.then(resp => (resp.data)); .then(resp => (resp.data));
} }
} }

View File

@ -117,11 +117,11 @@ class Sist2ElasticsearchQuery {
} }
if (dateMin && dateMax) { if (dateMin && dateMax) {
filters.push({range: {mtime: {gte: dateMin, lte: dateMax}}}) filters.push({range: {mtime: {gte: dateMin, lte: dateMax, format: "epoch_second"}}})
} else if (dateMin) { } else if (dateMin) {
filters.push({range: {mtime: {gte: dateMin}}}) filters.push({range: {mtime: {gte: dateMin, format: "epoch_second"}}})
} else if (dateMax) { } else if (dateMax) {
filters.push({range: {mtime: {lte: dateMax}}}) filters.push({range: {mtime: {lte: dateMax, format: "epoch_second"}}})
} }
const path = pathText.replace(/\/$/, "").toLowerCase(); //remove trailing slashes const path = pathText.replace(/\/$/, "").toLowerCase(); //remove trailing slashes

View File

@ -25,6 +25,7 @@ const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/", "/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/", "/usr/share/tesseract-ocr/tessdata/",
"/usr/share/tesseract-ocr/4.00/tessdata/", "/usr/share/tesseract-ocr/4.00/tessdata/",
"/usr/share/tesseract-ocr/5/tessdata/",
"./", "./",
NULL NULL
}; };

View File

@ -55,7 +55,7 @@
static const char *const Version = VERSION; static const char *const Version = VERSION;
static const int VersionMajor = 3; static const int VersionMajor = 3;
static const int VersionMinor = 4; static const int VersionMinor = 4;
static const int VersionPatch = 2; static const int VersionPatch = 3;
#ifndef SIST_PLATFORM #ifndef SIST_PLATFORM
#define SIST_PLATFORM unknown #define SIST_PLATFORM unknown

View File

@ -175,9 +175,19 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
return TRUE; return TRUE;
} }
/*
 * Non-zero when `message` contains one of the substrings that
 * fz_err_callback() / fz_warn_callback() skip instead of logging:
 * "invalid glyph index" or "... repeated".
 *
 * `message` must be a non-NULL, NUL-terminated string. It may be evaluated
 * twice, so do not pass an expression with side effects.
 *
 * The final line deliberately has no trailing backslash: a continuation
 * there would splice the next source line into the macro body.
 */
#define IS_IGNORED_MESSAGE(message) \
    ( \
        strstr((message), "invalid glyph index") != NULL \
        || strstr((message), "... repeated") != NULL \
    )
void fz_err_callback(void *user, const char *message) { void fz_err_callback(void *user, const char *message) {
document_t *doc = (document_t *) user; document_t *doc = (document_t *) user;
if (IS_IGNORED_MESSAGE(message)) {
return;
}
const scan_ebook_ctx_t *ctx = &thread_ctx; const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message); CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message);
} }
@ -185,6 +195,10 @@ void fz_err_callback(void *user, const char *message) {
void fz_warn_callback(void *user, const char *message) { void fz_warn_callback(void *user, const char *message) {
document_t *doc = (document_t *) user; document_t *doc = (document_t *) user;
if (IS_IGNORED_MESSAGE(message)) {
return;
}
const scan_ebook_ctx_t *ctx = &thread_ctx; const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message); CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message);
} }

View File

@ -223,14 +223,10 @@ read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *d
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) { void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {
meta_line_t *meta = doc->meta_head; if (meta_contains_key(doc->meta_head, key)) {
while (meta != NULL) { CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s'",
if (meta->key == key) { key, tag->value);
CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'", return;
key, meta->str_val, key, tag->value);
return;
}
meta = meta->next;
} }
text_buffer_t tex = text_buffer_create(-1); text_buffer_t tex = text_buffer_create(-1);
@ -445,7 +441,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
return SAVE_THUMBNAIL_FAILED; return SAVE_THUMBNAIL_FAILED;
} }
if (ctx->tesseract_lang != NULL && thumbnail_index == 0) { if (ctx->tesseract_lang != NULL && thumbnail_index == 0 && !meta_contains_key(doc->meta_head, MetaContent)) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame); ocr_image(ctx, doc, decoder, frame_and_packet->frame);
} }

View File

@ -392,4 +392,18 @@ static parse_job_t *create_parse_job(const char *filepath, int mtime, size_t st_
return job; return job;
} }
/**
 * Report whether a metadata list holds an entry with the given key.
 *
 * Follows the `next` links starting at meta_head until the list ends.
 *
 * @param meta_head first node of the list; NULL is treated as an empty list
 * @param key       key to look for
 * @return TRUE if some node's `key` field equals `key`, FALSE otherwise
 */
static int meta_contains_key (meta_line_t *meta_head, enum metakey key) {
    for (meta_line_t *node = meta_head; node != NULL; node = node->next) {
        if (node->key == key) {
            return TRUE;
        }
    }

    return FALSE;
}
#endif #endif