diff --git a/README.md b/README.md index d81a8a7..7176f1c 100644 --- a/README.md +++ b/README.md @@ -67,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details ## Format support -File type | Library | Content | Thumbnail | Metadata -:---|:---|:---|:---|:--- -pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title | -cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - | -`audio/*` | ffmpeg | - | yes | ID3 tags | -`video/*` | ffmpeg | - | yes | title, comment, artist | -`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags | -raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags | -ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | -`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - | -html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - | -tar, zip, rar, 7z, ar ... 
| Libarchive | yes\* | - | no | -docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title | -doc (MS Word 97-2003) | antiword | yes | yes | author, title | -mobi, azw, azw3 | libmobi | yes | no | author, title | -wpd (WordPerfect) | libwpd | yes | no | *planned* | -json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - | +| File type | Library | Content | Thumbnail | Metadata | +|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------| +| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title | +| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - | +| `audio/*` | ffmpeg | - | yes | ID3 tags | +| `video/*` | ffmpeg | - | yes | title, comment, artist | +| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags | +| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags | +| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | +| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - | +| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - | +| tar, zip, rar, 7z, ar ... 
| Libarchive | yes\* | - | no | +| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title | +| doc (MS Word 97-2003) | antiword | yes | yes | author, title | +| mobi, azw, azw3 | libmobi | yes | no | author, title | +| wpd (WordPerfect) | libwpd | yes | no | *planned* | +| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - | \* *See [Archive files](#archive-files)* @@ -109,11 +109,16 @@ directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-File The `simon987/sist2` image comes with common languages (hin, jpn, eng, fra, rus, spa) pre-installed. -Examples +You can use the `+` separator to specify multiple languages. Each language +name must exactly match the base name of a `*.traineddata` file installed on your system +(for example, use `chi_sim` rather than `chi-sim`). + +Examples: ```bash sist2 scan --ocr jpn ~/Books/Manga/ sist2 scan --ocr eng ~/Books/Textbooks/ +sist2 scan --ocr eng+chi_sim ~/Books/Chinese-Bilingual/ ``` ## Build from source @@ -126,7 +131,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled git clone --recursive https://github.com/simon987/sist2/ cd sist2 docker build . 
-f ./Dockerfile -t my-sist2-image -docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux +docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux ``` ### On a linux computer diff --git a/src/cli.c b/src/cli.c index eb8c7f5..6cb6a5c 100644 --- a/src/cli.c +++ b/src/cli.c @@ -146,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { if (args->name == NULL) { args->name = g_path_get_basename(args->output); } else { - char* tmp = malloc(strlen(args->name) + 1); + char *tmp = malloc(strlen(args->name) + 1); strcpy(tmp, args->name); args->name = tmp; } @@ -171,14 +171,25 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { if (args->tesseract_lang != NULL) { TessBaseAPI *api = TessBaseAPICreate(); - char filename[128]; - sprintf(filename, "%s.traineddata", args->tesseract_lang); - const char *path = find_file_in_paths(TESS_DATAPATHS, filename); - if (path == NULL) { - LOG_FATAL("cli.c", "Could not find tesseract language file!"); - } + const char* trained_data_path; + char *lang = malloc(strlen(args->tesseract_lang) + 1); + strcpy(lang, args->tesseract_lang); - ret = TessBaseAPIInit3(api, path, args->tesseract_lang); + lang = strtok(lang, "+"); + + while (lang != NULL) { + char filename[128]; + sprintf(filename, "%s.traineddata", lang); + trained_data_path = find_file_in_paths(TESS_DATAPATHS, filename); + if (trained_data_path == NULL) { + LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename); + } + + lang = strtok(NULL, "+"); + } + free(lang); + + ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang); if (ret != 0) { fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang); return 1; @@ -186,7 +197,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { TessBaseAPIEnd(api); TessBaseAPIDelete(api); - args->tesseract_path = path; + args->tesseract_path = trained_data_path; } if 
(args->exclude_regex != NULL) { @@ -220,7 +231,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) { } if (args->list_path != NULL) { - if(strcmp(args->list_path, "-") == 0) { + if (strcmp(args->list_path, "-") == 0) { args->list_file = stdin; LOG_DEBUG("cli.c", "Using stdin as list file") } else { diff --git a/third-party/libscan/libscan/ocr/ocr.h b/third-party/libscan/libscan/ocr/ocr.h index 423f2c5..8cdd020 100644 --- a/third-party/libscan/libscan/ocr/ocr.h +++ b/third-party/libscan/libscan/ocr/ocr.h @@ -19,25 +19,26 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang, const int img_bpp, const int img_stride, const int img_xres, const ocr_extract_callback_t cb) { - if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 || - !OCR_IS_VALID_BPP(img_bpp)) { - return; - } + if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 || + !OCR_IS_VALID_BPP(img_bpp)) { + return; + } - TessBaseAPI *api = TessBaseAPICreate(); - TessBaseAPIInit3(api, tesseract_path, tesseract_lang); + TessBaseAPI *api = TessBaseAPICreate(); + TessBaseAPIInit3(api, tesseract_path, tesseract_lang); - TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride); - TessBaseAPISetSourceResolution(api, img_xres); + TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride); + TessBaseAPISetSourceResolution(api, img_xres); - char *text = TessBaseAPIGetUTF8Text(api); - size_t len = strlen(text); - if (len >= MIN_OCR_LEN) { - cb(text, len); - } + char *text = TessBaseAPIGetUTF8Text(api); + size_t len = strlen(text); + if (len >= MIN_OCR_LEN) { + cb(text, len); + } + TessDeleteText(text); - TessBaseAPIEnd(api); - TessBaseAPIDelete(api); + TessBaseAPIEnd(api); + TessBaseAPIDelete(api); } #endif