From d10c13d46d2becf8dc32de1c7692227c92e04400 Mon Sep 17 00:00:00 2001
From: Yatao Li
Date: Sun, 9 Jan 2022 03:09:46 +0800
Subject: [PATCH] fix #228: Handle multiple languages for tesseract OCR

---
Review notes (not part of the commit message):
 - find_tessdata() now rejects language codes that do not fit its file name
   buffer instead of writing past the end of it.
 - Hunk headers and the diffstat were recomputed for that change; the
   post-image blob id on the "index" line was not and is stale.

 src/cli.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/src/cli.c b/src/cli.c
index eb8c7f5..f220569 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -77,6 +77,28 @@ void exec_args_destroy(exec_args_t *args) {
     free(args);
 }
 
+/**
+ * Look up "LANG.traineddata" in TESS_DATAPATHS for a single language code.
+ *
+ * @param p_lang start of the language code; need not be NUL-terminated at len
+ * @param len    number of characters of p_lang that make up the code
+ * @return whatever find_file_in_paths() returns for that file name, or NULL
+ *         when the code is too long to fit the file name buffer
+ */
+__always_inline
+static const char *find_tessdata(const char *p_lang, size_t len) {
+    static const char suffix[] = ".traineddata";
+    char filename[128];
+
+    /* len is derived from args->tesseract_lang without any length check. */
+    if (len > sizeof(filename) - sizeof(suffix)) {
+        return NULL;
+    }
+    memcpy(filename, p_lang, len);
+    memcpy(filename + len, suffix, sizeof(suffix));
+    return find_file_in_paths(TESS_DATAPATHS, filename);
+}
+
 int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     if (argc < 2) {
         fprintf(stderr, "Required positional argument: PATH.\n");
@@ -171,16 +193,28 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
     if (args->tesseract_lang != NULL) {
         TessBaseAPI *api = TessBaseAPICreate();
 
-        char filename[128];
-        sprintf(filename, "%s.traineddata", args->tesseract_lang);
-        const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
-        if (path == NULL) {
-            LOG_FATAL("cli.c", "Could not find tesseract language file!");
+        const char *p_lang = args->tesseract_lang;
+        const char *e_lang = p_lang + strlen(p_lang);
+        const char *path = NULL;
+        while (p_lang < e_lang) {
+            const char *p_plus = strstr(p_lang, "+");
+            if (p_plus == NULL) {
+                p_plus = e_lang;
+            }
+            const char* _path = find_tessdata(p_lang, p_plus - p_lang);
+            if (_path == NULL) {
+                LOG_FATAL("cli.c", "Could not find tesseract language file!");
+            } else if (path != NULL && path != _path) {
+                LOG_FATAL("cli.c", "Multiple tesseract language files are not in the same directory!");
+            }
+            path = _path;
+            p_lang = p_plus + 1;
         }
 
         ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
         if (ret != 0) {
             fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
+            TessBaseAPIDelete(api);
             return 1;
         }
         TessBaseAPIEnd(api);