From 94a5e0ac59e1998bd1e1741145b861277c139d31 Mon Sep 17 00:00:00 2001 From: Yatao Li Date: Fri, 7 Jan 2022 23:20:35 +0800 Subject: [PATCH 1/2] refactor: split ocr_extract_text from ebook --- third-party/libscan/libscan/ebook/ebook.c | 29 ++++----------- third-party/libscan/libscan/ocr/ocr.h | 43 +++++++++++++++++++++++ 2 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 third-party/libscan/libscan/ocr/ocr.h diff --git a/third-party/libscan/libscan/ebook/ebook.c b/third-party/libscan/libscan/ebook/ebook.c index 4cdef33..29c907e 100644 --- a/third-party/libscan/libscan/ebook/ebook.c +++ b/third-party/libscan/libscan/ebook/ebook.c @@ -5,9 +5,7 @@ #include "../media/media.h" #include "../arc/arc.h" - -#define MIN_OCR_SIZE 350 -#define MIN_OCR_LEN 10 +#include "../ocr/ocr.h" /* fill_image callback doesn't let us pass opaque pointers unless I create my own device */ __thread text_buffer_t thread_buffer; @@ -225,7 +223,9 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) { return 0; } -#define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32) +static void fill_image_ocr_cb(const char* text, size_t len) { + text_buffer_append_string(&thread_buffer, text, len - 1); +} void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha), @@ -233,26 +233,9 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), int l2factor = 0; - if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) { - + if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && OCR_IS_VALID_BPP(img->n)) { fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor); - - if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) { - TessBaseAPI *api = TessBaseAPICreate(); - TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang); - - TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride); - 
TessBaseAPISetSourceResolution(api, pix->xres); - - char *text = TessBaseAPIGetUTF8Text(api); - size_t len = strlen(text); - if (len >= MIN_OCR_LEN) { - text_buffer_append_string(&thread_buffer, text, len - 1); - } - - TessBaseAPIEnd(api); - TessBaseAPIDelete(api); - } + ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb); fz_drop_pixmap(fzctx, pix); } } diff --git a/third-party/libscan/libscan/ocr/ocr.h b/third-party/libscan/libscan/ocr/ocr.h new file mode 100644 index 0000000..caf5972 --- /dev/null +++ b/third-party/libscan/libscan/ocr/ocr.h @@ -0,0 +1,43 @@ +#ifndef OCR_H +#define OCR_H + +#include "../scan.h" +#include <tesseract/capi.h> + +#define MIN_OCR_SIZE 350 +#define MIN_OCR_LEN 10 + +#define OCR_IS_VALID_BPP(d) \ + ((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || (d) == 16 || (d) == 24 || \ + (d) == 32) + +typedef void (*ocr_extract_callback_t)(const char *, size_t); + +__always_inline static void +ocr_extract_text(const char *tesseract_path, const char *tesseract_lang, + const unsigned char *img_buf, const int img_w, const int img_h, + const int img_bpp, const int img_stride, const int img_xres, + const ocr_extract_callback_t cb) { + + if (img_h <= MIN_OCR_SIZE || img_h <= MIN_OCR_SIZE || img_xres <= 0 || + !OCR_IS_VALID_BPP(img_bpp)) { + return; + } + + TessBaseAPI *api = TessBaseAPICreate(); + TessBaseAPIInit3(api, tesseract_path, tesseract_lang); + + TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride); + TessBaseAPISetSourceResolution(api, img_xres); + + char *text = TessBaseAPIGetUTF8Text(api); + size_t len = strlen(text); + if (len >= MIN_OCR_LEN) { + cb(text, len); + } + + TessBaseAPIEnd(api); + TessBaseAPIDelete(api); +} + +#endif From cd2a44e016247f3bb219ea303e3192de0ba39c8d Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 8 Jan 2022 10:24:57 -0500 Subject: [PATCH 2/2] Update ocr.h Fix minimum image size validation in ocr_extract_text --- 
third-party/libscan/libscan/ocr/ocr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/libscan/libscan/ocr/ocr.h b/third-party/libscan/libscan/ocr/ocr.h index caf5972..e4f158d 100644 --- a/third-party/libscan/libscan/ocr/ocr.h +++ b/third-party/libscan/libscan/ocr/ocr.h @@ -19,7 +19,7 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang, const int img_bpp, const int img_stride, const int img_xres, const ocr_extract_callback_t cb) { - if (img_h <= MIN_OCR_SIZE || img_h <= MIN_OCR_SIZE || img_xres <= 0 || + if (img_w <= MIN_OCR_SIZE || img_h <= MIN_OCR_SIZE || img_xres <= 0 || !OCR_IS_VALID_BPP(img_bpp)) { return; }