OCR tweaks

This commit is contained in:
simon 2020-02-11 21:13:47 -05:00
parent 7f98d5a682
commit 7f6f3c02fa

View File

@@ -2,6 +2,7 @@
#include "src/ctx.h" #include "src/ctx.h"
#define MIN_OCR_SIZE 350 #define MIN_OCR_SIZE 350
/* Minimum strlen() of the Tesseract output required before it is appended to thread_buffer. */
#define MIN_OCR_LEN 10
__thread text_buffer_t thread_buffer; __thread text_buffer_t thread_buffer;
@@ -128,6 +129,7 @@ int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
return 0; return 0;
} }
/*
 * Evaluates to non-zero when d equals one of 1, 2, 4, 8, 16, 24 or 32,
 * and to zero otherwise.
 *
 * Every use of the argument is parenthesized so that an expression built
 * from lower-precedence operators (e.g. IS_VALID_BPP(a | b)) is compared
 * as a whole instead of being split by the == operator.
 *
 * The argument is evaluated more than once: do not pass an expression
 * with side effects.
 *
 * NOTE(review): the visible call site passes img->n; confirm that this
 * field really holds a bits-per-pixel value and not a component count.
 */
#define IS_VALID_BPP(d) \
    ((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || \
     (d) == 16 || (d) == 24 || (d) == 32)
void fill_image(fz_context *ctx, UNUSED(fz_device *dev), void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha), fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
@@ -135,7 +137,7 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
int l2factor = 0; int l2factor = 0;
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE) { if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
fz_pixmap *pix = img->get_pixmap(ctx, img, NULL, img->w, img->h, &l2factor); fz_pixmap *pix = img->get_pixmap(ctx, img, NULL, img->w, img->h, &l2factor);
@@ -148,12 +150,14 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
char *text = TessBaseAPIGetUTF8Text(api); char *text = TessBaseAPIGetUTF8Text(api);
size_t len = strlen(text); size_t len = strlen(text);
if (len >= MIN_OCR_LEN) {
text_buffer_append_string(&thread_buffer, text, len - 1); text_buffer_append_string(&thread_buffer, text, len - 1);
LOG_DEBUGF( LOG_DEBUGF(
"pdf.c", "pdf.c",
"(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB", "(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
) )
}
TessBaseAPIEnd(api); TessBaseAPIEnd(api);
TessBaseAPIDelete(api); TessBaseAPIDelete(api);