mirror of
https://github.com/simon987/sist2.git
synced 2025-04-19 10:16:42 +00:00
OCR tweaks
This commit is contained in:
parent
7f98d5a682
commit
7f6f3c02fa
@ -2,6 +2,7 @@
|
|||||||
#include "src/ctx.h"
|
#include "src/ctx.h"
|
||||||
|
|
||||||
#define MIN_OCR_SIZE 350
|
#define MIN_OCR_SIZE 350
|
||||||
|
/* Minimum strlen() of the text returned by tesseract for it to be appended
 * to the thread text buffer; shorter OCR results are skipped. */
#define MIN_OCR_LEN 10
|
||||||
__thread text_buffer_t thread_buffer;
|
__thread text_buffer_t thread_buffer;
|
||||||
|
|
||||||
|
|
||||||
@ -128,6 +129,7 @@ int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Evaluates to non-zero when d is one of the accepted depths
 * (1, 2, 4, 8, 16, 24 or 32), and to zero otherwise.
 *
 * The argument is parenthesized in every comparison so that expression
 * arguments (e.g. `a | b`, `x + 1`) are compared as a whole instead of being
 * re-associated by operator precedence. Note that d is still evaluated more
 * than once: do not pass an expression with side effects.
 *
 * NOTE(review): the caller passes img->n; confirm that this field really
 * holds a bits-per-pixel value.
 */
#define IS_VALID_BPP(d) ((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || \
                         (d) == 16 || (d) == 24 || (d) == 32)
|
||||||
|
|
||||||
void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
|
void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
|
||||||
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
|
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
|
||||||
@ -135,7 +137,7 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
|
|||||||
|
|
||||||
int l2factor = 0;
|
int l2factor = 0;
|
||||||
|
|
||||||
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE) {
|
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
|
||||||
|
|
||||||
fz_pixmap *pix = img->get_pixmap(ctx, img, NULL, img->w, img->h, &l2factor);
|
fz_pixmap *pix = img->get_pixmap(ctx, img, NULL, img->w, img->h, &l2factor);
|
||||||
|
|
||||||
@ -148,12 +150,14 @@ void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
|
|||||||
|
|
||||||
char *text = TessBaseAPIGetUTF8Text(api);
|
char *text = TessBaseAPIGetUTF8Text(api);
|
||||||
size_t len = strlen(text);
|
size_t len = strlen(text);
|
||||||
|
if (len >= MIN_OCR_LEN) {
|
||||||
text_buffer_append_string(&thread_buffer, text, len - 1);
|
text_buffer_append_string(&thread_buffer, text, len - 1);
|
||||||
LOG_DEBUGF(
|
LOG_DEBUGF(
|
||||||
"pdf.c",
|
"pdf.c",
|
||||||
"(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
|
"(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
|
||||||
pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
|
pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
|
||||||
TessBaseAPIEnd(api);
|
TessBaseAPIEnd(api);
|
||||||
TessBaseAPIDelete(api);
|
TessBaseAPIDelete(api);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user