Update --ocr-* args, enable OCR'ing images

2025-12-10 22:18:54 +00:00 · 2022-01-08 14:24:50 -05:00
parent b37e5a4ad4
commit ad95684771
12 changed files with 121 additions and 24 deletions
--- a/third-party/libscan/libscan/ebook/ebook.c
+++ b/third-party/libscan/libscan/ebook/ebook.c
@@ -233,7 +233,7 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),

    int l2factor = 0;

-    if (img->w >= MIN_OCR_SIZE && img->h >= MIN_OCR_SIZE && OCR_IS_VALID_BPP(img->n)) {
+    if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
        fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
        ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);
        fz_drop_pixmap(fzctx, pix);
--- a/third-party/libscan/libscan/media/media.c
+++ b/third-party/libscan/libscan/media/media.c
@@ -1,12 +1,18 @@
 #include "media.h"
+#include "../ocr/ocr.h"
 #include <ctype.h>

 #define MIN_SIZE 32
 #define AVIO_BUF_SIZE 8192
 #define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)

+#define STREAM_IS_IMAGE (stream->nb_frames <= 1) // expands against a variable named `stream` in the caller's scope; NOTE(review): nb_frames may be 0 when the frame count is unknown - confirm such videos are not treated as images
+
 #define STORE_AS_IS ((void*)-1)

+// Pointer to document being processed
+__thread document_t *thread_doc;
+
 const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {

    int has_extension = doc->ext > doc->base;
@@ -311,7 +317,7 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
            if (strcmp(key, "artist") == 0) {
                append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
            } else if (strcmp(key, "imagedescription") == 0) {
-                APPEND_TAG_META(MetaContent)
+                append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
            } else if (strcmp(key, "make") == 0) {
                APPEND_TAG_META(MetaExifMake)
            } else if (strcmp(key, "model") == 0) {
@@ -343,6 +349,55 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
    }
 }

+static void ocr_image_cb(const char *text, size_t len) { // callback handed to ocr_extract_text by ocr_image; len is not used here
+    APPEND_STR_META(thread_doc, MetaContent, text); // thread_doc is assigned by ocr_image right before the OCR call
+}
+
+#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32 // destination pixel format of the pre-OCR conversion in ocr_image
+#define OCR_BYTES_PER_PIXEL 4 // passed to ocr_extract_text as img_bpp
+#define OCR_PIXELS_PER_INCH 70 // passed to ocr_extract_text as img_xres (source resolution given to Tesseract)
+
+/* Convert a decoded image frame to packed RGB32 and run Tesseract on it; recognized text is
+ * appended to doc as MetaContent via ocr_image_cb. Best-effort: any FFmpeg failure skips OCR. */
+void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {
+    uint8_t *dst_buf = NULL;
+    AVFrame *rgb_frame = av_frame_alloc();
+
+    // Same dimensions in and out: only the pixel format changes
+    struct SwsContext *sws_ctx = sws_getContext(
+            frame->width, frame->height, decoder->pix_fmt,
+            frame->width, frame->height, OCR_PIXEL_FORMAT,
+            SWS_LANCZOS, 0, 0, 0
+    );
+
+    int dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
+    if (rgb_frame == NULL || sws_ctx == NULL || dst_buf_len <= 0) {
+        goto cleanup; // allocation failed, or format/dimensions rejected by FFmpeg
+    }
+
+    // 2x headroom is kept from the original code (NOTE(review): presumably slack for swscale - confirm)
+    dst_buf = av_malloc((size_t) dst_buf_len * 2);
+    if (dst_buf == NULL || av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf,
+                                                OCR_PIXEL_FORMAT, frame->width, frame->height, 1) < 0) {
+        goto cleanup;
+    }
+
+    if (sws_scale(sws_ctx, (const uint8_t *const *) frame->data, frame->linesize,
+                  0, frame->height, rgb_frame->data, rgb_frame->linesize) <= 0) {
+        goto cleanup; // nothing was converted, so there is nothing to OCR
+    }
+
+    thread_doc = doc; // ocr_image_cb appends to this document
+    ocr_extract_text(ctx->tesseract_path, ctx->tesseract_lang, rgb_frame->data[0],
+                     frame->width, frame->height, OCR_BYTES_PER_PIXEL, rgb_frame->linesize[0],
+                     OCR_PIXELS_PER_INCH, ocr_image_cb);
+
+cleanup:
+    sws_freeContext(sws_ctx); // all three release calls accept NULL
+    av_free(dst_buf);
+    av_frame_free(&rgb_frame);
+}
+
 void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {

    int video_stream = -1;
@@ -419,11 +474,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
        avcodec_open2(decoder, video_codec, NULL);

        //Seek
-        if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
+        if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
            int seek_ret;
            for (int i = 20; i >= 0; i--) {
                seek_ret = av_seek_frame(pFormatCtx, video_stream,
-                                         stream->duration * 0.10, 0);
+                                         (long) ((double) stream->duration * 0.10), 0);
                if (seek_ret == 0) {
                    break;
                }
@@ -438,6 +493,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
            return;
        }

+        if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
+            ocr_image(ctx, doc, decoder, frame_and_packet->frame);
+        }
+
+        // NOTE: OCR'd content takes precedence over exif image description
        append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));

        // Scale frame
--- a/third-party/libscan/libscan/media/media.h
+++ b/third-party/libscan/libscan/media/media.h
@@ -19,6 +19,9 @@ typedef struct {
    float tn_qscale;
    long max_media_buffer;
    int read_subtitles;
+
+    const char *tesseract_lang;
+    const char *tesseract_path;
 } scan_media_ctx_t;

 __always_inline
--- a/third-party/libscan/libscan/ocr/ocr.h
+++ b/third-party/libscan/libscan/ocr/ocr.h
@@ -4,7 +4,8 @@
 #include "../scan.h"
 #include <tesseract/capi.h>

-#define MIN_OCR_SIZE 350
+#define MIN_OCR_WIDTH 350
+#define MIN_OCR_HEIGHT 100
 #define MIN_OCR_LEN 10

 #define OCR_IS_VALID_BPP(d)                                                    \
@@ -19,7 +20,7 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
                 const int img_bpp, const int img_stride, const int img_xres,
                 const ocr_extract_callback_t cb) {

-    if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
+    if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
        !OCR_IS_VALID_BPP(img_bpp)) {
        return;
    }
@@ -31,11 +32,13 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
    TessBaseAPISetSourceResolution(api, img_xres);

    char *text = TessBaseAPIGetUTF8Text(api);
-    size_t len = strlen(text);
-    if (len >= MIN_OCR_LEN) {
-        cb(text, len);
+    if (text != NULL) {
+        size_t len = strlen(text);
+        if (len >= MIN_OCR_LEN) {
+            cb(text, len);
+        }
+        TessDeleteText(text);
    }
-    TessDeleteText(text);

    TessBaseAPIEnd(api);
    TessBaseAPIDelete(api);
--- a/third-party/libscan/libscan/scan.h
+++ b/third-party/libscan/libscan/scan.h
@@ -61,6 +61,7 @@ enum metakey {
    MetaFontName,
    MetaParent,
    MetaExifMake,
+    MetaExifDescription,
    MetaExifSoftware,
    MetaExifExposureTime,
    MetaExifFNumber,