Fix tesseract lang validation, update README.md, fix tesseract memory leak

2025-12-18 01:39:05 +00:00 · 2022-01-08 11:04:52 -05:00
parent 255bc2d689
commit 15ae2190cf
3 changed files with 61 additions and 44 deletions
--- a/README.md
+++ b/README.md
@@ -67,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details

 ## Format support

-File type | Library | Content | Thumbnail | Metadata
-:---|:---|:---|:---|:---
-pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
-cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - |
-`audio/*` | ffmpeg | - | yes | ID3 tags |
-`video/*` | ffmpeg | - | yes | title, comment, artist |
-`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
-raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf  | LibRaw | - | yes | Common EXIF tags, GPS tags |
-ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
-`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - |
-html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - |
-tar, zip, rar, 7z, ar ...  | Libarchive | yes\* | - | no |
-docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title |
-doc (MS Word 97-2003) | antiword | yes | yes | author, title |
-mobi, azw, azw3 | libmobi | yes | no | author, title |
-wpd (WordPerfect) | libwpd | yes | no | *planned* |
-json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - |
+| File type                                                                 | Library                                                                      | Content  | Thumbnail   | Metadata                                                                                                                               |
+|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
+| pdf,xps,fb2,epub                                                          | MuPDF                                                                        | text+ocr | yes         | author, title                                                                                                                          |
+| cbz,cbr                                                                   | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | -        | yes         | -                                                                                                                                      |
+| `audio/*`                                                                 | ffmpeg                                                                       | -        | yes         | ID3 tags                                                                                                                               |
+| `video/*`                                                                 | ffmpeg                                                                       | -        | yes         | title, comment, artist                                                                                                                 |
+| `image/*`                                                                 | ffmpeg                                                                       | -        | yes         | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
+| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw                                                                       | -        | yes         | Common EXIF tags, GPS tags                                                                                                             |
+| ttf,ttc,cff,woff,fnt,otf                                                  | Freetype2                                                                    | -        | yes, `bmp`  | Name & style                                                                                                                           |
+| `text/plain`                                                              | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes      | no          | -                                                                                                                                      |
+| html, xml                                                                 | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes      | no          | -                                                                                                                                      |
+| tar, zip, rar, 7z, ar ...                                                 | Libarchive                                                                   | yes\*    | -           | no                                                                                                                                     |
+| docx, xlsx, pptx                                                          | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes      | if embedded | creator, modified_by, title                                                                                                            |
+| doc (MS Word 97-2003)                                                     | antiword                                                                     | yes      | yes         | author, title                                                                                                                          |
+| mobi, azw, azw3                                                           | libmobi                                                                      | yes      | no          | author, title                                                                                                                          |
+| wpd (WordPerfect)                                                         | libwpd                                                                       | yes      | no          | *planned*                                                                                                                              |
+| json, jsonl, ndjson                                                       | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes      | -           | -                                                                                                                                      |

 \* *See [Archive files](#archive-files)*

@@ -109,11 +109,16 @@ directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-File
 The `simon987/sist2` image comes with common languages
 (hin, jpn, eng, fra, rus, spa) pre-installed.

-Examples
+You can use the `+` separator to specify multiple languages. Each language
+name must exactly match the base name of a `*.traineddata` file installed on
+your system (for example, use `chi_sim` rather than `chi-sim`).
+
+Examples:

 ```bash
 sist2 scan --ocr jpn ~/Books/Manga/
 sist2 scan --ocr eng ~/Books/Textbooks/
+sist2 scan --ocr eng+chi_sim ~/Books/Chinese-Bilingual/
 ```

 ## Build from source
@@ -126,7 +131,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
 git clone --recursive https://github.com/simon987/sist2/
 cd sist2
 docker build . -f ./Dockerfile -t my-sist2-image
-docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
+docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
 ```

 ### On a linux computer
--- a/src/cli.c
+++ b/src/cli.c
@@ -146,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
    if (args->name == NULL) {
        args->name = g_path_get_basename(args->output);
    } else {
-        char* tmp = malloc(strlen(args->name) + 1);
+        char *tmp = malloc(strlen(args->name) + 1);
        strcpy(tmp, args->name);
        args->name = tmp;
    }
@@ -171,14 +171,25 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
    if (args->tesseract_lang != NULL) {
        TessBaseAPI *api = TessBaseAPICreate();

-        char filename[128];
-        sprintf(filename, "%s.traineddata", args->tesseract_lang);
-        const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
-        if (path == NULL) {
-            LOG_FATAL("cli.c", "Could not find tesseract language file!");
-        }
+        const char* trained_data_path;
+        char *lang = malloc(strlen(args->tesseract_lang) + 1);
+        strcpy(lang, args->tesseract_lang);

-        ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
+        lang = strtok(lang, "+");
+
+        while (lang != NULL) {
+            char filename[128];
+            sprintf(filename, "%s.traineddata", lang);
+            trained_data_path = find_file_in_paths(TESS_DATAPATHS, filename);
+            if (trained_data_path == NULL) {
+                LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
+            }
+
+            lang = strtok(NULL, "+");
+        }
+        free(lang);
+
+        ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
        if (ret != 0) {
            fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
            return 1;
@@ -186,7 +197,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
        TessBaseAPIEnd(api);
        TessBaseAPIDelete(api);

-        args->tesseract_path = path;
+        args->tesseract_path = trained_data_path;
    }

    if (args->exclude_regex != NULL) {
@@ -220,7 +231,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
    }

    if (args->list_path != NULL) {
-        if(strcmp(args->list_path, "-") == 0) {
+        if (strcmp(args->list_path, "-") == 0) {
            args->list_file = stdin;
            LOG_DEBUG("cli.c", "Using stdin as list file")
        } else {
--- a/third-party/libscan/libscan/ocr/ocr.h
+++ b/third-party/libscan/libscan/ocr/ocr.h
@@ -19,25 +19,26 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
                 const int img_bpp, const int img_stride, const int img_xres,
                 const ocr_extract_callback_t cb) {

-  if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
-      !OCR_IS_VALID_BPP(img_bpp)) {
-    return;
-  }
+    if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
+        !OCR_IS_VALID_BPP(img_bpp)) {
+        return;
+    }

-  TessBaseAPI *api = TessBaseAPICreate();
-  TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
+    TessBaseAPI *api = TessBaseAPICreate();
+    TessBaseAPIInit3(api, tesseract_path, tesseract_lang);

-  TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
-  TessBaseAPISetSourceResolution(api, img_xres);
+    TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
+    TessBaseAPISetSourceResolution(api, img_xres);

-  char *text = TessBaseAPIGetUTF8Text(api);
-  size_t len = strlen(text);
-  if (len >= MIN_OCR_LEN) {
-    cb(text, len);
-  }
+    char *text = TessBaseAPIGetUTF8Text(api);
+    size_t len = strlen(text);
+    if (len >= MIN_OCR_LEN) {
+        cb(text, len);
+    }
+    TessDeleteText(text);

-  TessBaseAPIEnd(api);
-  TessBaseAPIDelete(api);
+    TessBaseAPIEnd(api);
+    TessBaseAPIDelete(api);
 }

 #endif