Fix tesseract lang validation, update README.md, fix tesseract memory leak

This commit is contained in:
simon987 2022-01-08 11:04:52 -05:00
parent 255bc2d689
commit 15ae2190cf
3 changed files with 61 additions and 44 deletions

View File

@ -67,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
## Format support ## Format support
File type | Library | Content | Thumbnail | Metadata | File type | Library | Content | Thumbnail | Metadata |
:---|:---|:---|:---|:--- |:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title | | pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - | | cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags | | `audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist | | `video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags | | `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags | | raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | | ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - | | `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - | | html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no | | tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title | | docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title | | doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title | | mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* | | wpd (WordPerfect) | libwpd | yes | no | *planned* |
json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - | | json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)* \* *See [Archive files](#archive-files)*
@ -109,11 +109,16 @@ directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-File
The `simon987/sist2` image comes with common languages The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed. (hin, jpn, eng, fra, rus, spa) pre-installed.
Examples You can use the `+` separator to specify multiple languages. The language
name must match the base name (without the extension) of the `*.traineddata` file installed on your system
(for example, use `chi_sim` rather than `chi-sim`).
Examples:
```bash ```bash
sist2 scan --ocr jpn ~/Books/Manga/ sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/ sist2 scan --ocr eng ~/Books/Textbooks/
sist2 scan --ocr eng+chi_sim ~/Books/Chinese-Bilingual/
``` ```
## Build from source ## Build from source
@ -126,7 +131,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
git clone --recursive https://github.com/simon987/sist2/ git clone --recursive https://github.com/simon987/sist2/
cd sist2 cd sist2
docker build . -f ./Dockerfile -t my-sist2-image docker build . -f ./Dockerfile -t my-sist2-image
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
``` ```
### On a linux computer ### On a linux computer

View File

@ -171,14 +171,25 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->tesseract_lang != NULL) { if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate(); TessBaseAPI *api = TessBaseAPICreate();
const char* trained_data_path;
char *lang = malloc(strlen(args->tesseract_lang) + 1);
strcpy(lang, args->tesseract_lang);
lang = strtok(lang, "+");
while (lang != NULL) {
char filename[128]; char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang); sprintf(filename, "%s.traineddata", lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename); trained_data_path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) { if (trained_data_path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!"); LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
} }
ret = TessBaseAPIInit3(api, path, args->tesseract_lang); lang = strtok(NULL, "+");
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) { if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang); fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1; return 1;
@ -186,7 +197,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
TessBaseAPIEnd(api); TessBaseAPIEnd(api);
TessBaseAPIDelete(api); TessBaseAPIDelete(api);
args->tesseract_path = path; args->tesseract_path = trained_data_path;
} }
if (args->exclude_regex != NULL) { if (args->exclude_regex != NULL) {

View File

@ -35,6 +35,7 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
if (len >= MIN_OCR_LEN) { if (len >= MIN_OCR_LEN) {
cb(text, len); cb(text, len);
} }
TessDeleteText(text);
TessBaseAPIEnd(api); TessBaseAPIEnd(api);
TessBaseAPIDelete(api); TessBaseAPIDelete(api);