tesseract ocr path fix

This commit is contained in:
simon 2020-02-01 19:58:45 -05:00
parent d22f83c797
commit bbee8aa721
9 changed files with 54 additions and 7 deletions

View File

@ -15,6 +15,13 @@
#define DEFAULT_BIND_ADDR "localhost"
#define DEFAULT_PORT "4090"
const char* TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"./",
NULL
};
scan_args_t *scan_args_create() {
scan_args_t *args = calloc(sizeof(scan_args_t), 1);
@ -136,13 +143,23 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
ret = TessBaseAPIInit3(api, TESS_DATAPATH, args->tesseract_lang);
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char * path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
@ -156,7 +173,8 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
LOG_DEBUGF("cli.c", "arg path=%s", args->path)
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
LOG_DEBUGF("cli.c", "arg ocr=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
return 0;
}

View File

@ -17,6 +17,7 @@ typedef struct scan_args {
char *archive;
archive_mode_t archive_mode;
char *tesseract_lang;
const char *tesseract_path;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@ -28,6 +28,7 @@ struct {
pthread_mutex_t mupdf_mu;
char * tesseract_lang;
char * tesseract_path;
} ScanCtx;
struct {

File diff suppressed because one or more lines are too long

View File

@ -51,6 +51,7 @@ void sist2_scan(scan_args_t *args) {
strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root));
ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root);
ScanCtx.tesseract_lang = args->tesseract_lang;
ScanCtx.tesseract_path = args->tesseract_path;
init_dir(ScanCtx.index.path);

View File

@ -141,7 +141,7 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
TessBaseAPI *api = TessBaseAPICreate();
TessBaseAPIInit3(api, TESS_DATAPATH, ScanCtx.tesseract_lang);
TessBaseAPIInit3(api, ScanCtx.tesseract_path, ScanCtx.tesseract_lang);
TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
TessBaseAPISetSourceResolution(api, pix->xres);
@ -157,8 +157,8 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
fz_drop_pixmap(ctx, pix);
}
fz_drop_pixmap(ctx, pix);
}
}

View File

@ -3,7 +3,6 @@
#define UUID_STR_LEN 37
#define UNUSED(x) __attribute__((__unused__)) x
#define TESS_DATAPATH "/usr/share/tessdata/"
#include <glib-2.0/glib.h>
#include <unistd.h>

View File

@ -1,4 +1,5 @@
#include "util.h"
#include "src/ctx.h"
dyn_buffer_t dyn_buffer_create() {
dyn_buffer_t buf;
@ -317,4 +318,29 @@ GHashTable *incremental_get_table() {
return file_table;
}
const char *find_file_in_paths(const char *paths[], const char *filename) {
for (int i = 0; paths[i] != NULL; i++) {
char *apath = abspath(paths[i]);
if (apath == NULL) {
continue;
}
char path[PATH_MAX];
snprintf(path, sizeof(path), "%s%s", apath, filename);
LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, apath)
free(apath);
struct stat info;
int ret = stat(path, &info);
if (ret != -1) {
return paths[i];
}
}
return NULL;
}

View File

@ -74,5 +74,6 @@ int incremental_get(GHashTable *table, unsigned long inode_no);
int incremental_mark_file_for_copy(GHashTable *table, unsigned long inode_no);
const char *find_file_in_paths(const char **paths, const char *filename);
#endif