Use mupdf's OCR methods rather than raw tesseract, various fixes

This commit is contained in:
simon987 2023-07-10 21:40:58 -04:00
parent 5a1a04629f
commit 2596361af5
4 changed files with 132 additions and 70 deletions

View File

@ -206,7 +206,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
3. Install vcpkg dependencies 3. Install vcpkg dependencies
```bash ```bash
vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp] vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,ffprobe,zlib]
``` ```
4. Build 4. Build

View File

@ -149,6 +149,7 @@ void database_generate_stats(database_t *db, double treemap_threshold) {
merged_rows += 1; merged_rows += 1;
} }
free(iter);
} while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE); } while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE);
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,

View File

@ -11,8 +11,6 @@
pthread_mutex_t Mutex; pthread_mutex_t Mutex;
#endif #endif
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx; __thread scan_ebook_ctx_t thread_ctx;
static void my_fz_lock(UNUSED(void *user), int lock) { static void my_fz_lock(UNUSED(void *user), int lock) {
@ -232,21 +230,54 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
return 0; return 0;
} }
static void fill_image_ocr_cb(const char* text, size_t len) { static int ocr_progress(fz_context *fzctx, void *user_data, int progress) {
text_buffer_append_string(&thread_buffer, text, len - 1); scan_ebook_ctx_t *ctx = user_data;
CTX_LOG_INFOF("ebook.c", "OCR PROGRESS=%d", progress);
return 0;
} }
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), int read_stext(text_buffer_t *tex, fz_stext_page *stext) {
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
UNUSED(fz_color_params color_params)) {
int l2factor = 0; int count = 0;
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) { fz_stext_block *block = stext->first_block;
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, (int)pix->stride, pix->xres, fill_image_ocr_cb); while (block != NULL) {
fz_drop_pixmap(fzctx, pix); int ret = read_stext_block(block, tex);
count += 1;
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
} }
return count;
}
/*
 * Load page number `current_page` of `fzdoc` into `*page`.
 *
 * A failure thrown by fz_load_page() is caught here and reported through the
 * return value instead of propagating to the caller.
 *
 * Returns 0 when the page was loaded, otherwise the error code recorded in
 * `fzctx`. `*page` is left untouched when fz_load_page() throws.
 */
int load_page(fz_context *fzctx, fz_document *fzdoc, int current_page, fz_page **page) {
    int status = 0;
    fz_var(status);

    fz_try(fzctx) {
        *page = fz_load_page(fzctx, fzdoc, current_page);
    }
    fz_catch(fzctx) {
        status = fzctx->error.errcode;
    }

    return status;
}
/*
 * Create a structured-text device that writes into `stext`.
 *
 * The device is created with the FZ_STEXT_DEHYPHENATE option flag, and the
 * stroke/clip callbacks listed below are cleared before it is returned.
 * Returns the new device.
 */
fz_device *new_stext_dev(fz_context *fzctx, fz_stext_page *stext) {
fz_stext_options opts = {
.flags = FZ_STEXT_DEHYPHENATE,
.scale = 0
};
fz_device *stext_dev = fz_new_stext_device(fzctx, stext, &opts);
/* NOTE(review): presumably cleared so stroked and clipped content is ignored by the text extraction; confirm */
stext_dev->stroke_path = NULL;
stext_dev->stroke_text = NULL;
stext_dev->clip_text = NULL;
stext_dev->clip_stroke_path = NULL;
stext_dev->clip_stroke_text = NULL;
return stext_dev;
} }
void void
@ -326,46 +357,37 @@ parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mi
if (ctx->content_size > 0) { if (ctx->content_size > 0) {
fz_stext_options opts = {0}; text_buffer_t tex = text_buffer_create(ctx->content_size);
thread_buffer = text_buffer_create(ctx->content_size);
for (int current_page = 0; current_page < page_count; current_page++) { for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL; fz_page *page = NULL;
fz_var(err); err = load_page(fzctx, fzdoc, current_page, &page);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message); CTX_LOG_WARNINGF(doc->filepath,
text_buffer_destroy(&thread_buffer); "fz_load_page() returned error code [%d] %s", err, fzctx->error.message);
text_buffer_destroy(&tex);
fz_drop_page(fzctx, page); fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream); fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc); fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx); fz_drop_context(fzctx);
return; return;
} }
fz_rect page_mediabox = fz_bound_page(fzctx, page);
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page)); fz_stext_page *stext = fz_new_stext_page(fzctx, page_mediabox);
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts); fz_device *stext_dev = new_stext_dev(fzctx, stext);
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err); fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL); fz_try(fzctx)fz_run_page(fzctx, page, stext_dev, fz_identity, NULL);
fz_always(fzctx) { fz_always(fzctx) {
fz_close_device(fzctx, dev); fz_close_device(fzctx, stext_dev);
fz_drop_device(fzctx, dev); fz_drop_device(fzctx, stext_dev);
} fz_catch(fzctx)err = fzctx->error.errcode; } fz_catch(fzctx) err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message); CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message);
text_buffer_destroy(&thread_buffer); text_buffer_destroy(&tex);
fz_drop_page(fzctx, page); fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext); fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream); fz_drop_stream(fzctx, stream);
@ -374,29 +396,63 @@ parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mi
return; return;
} }
fz_stext_block *block = stext->first_block; int num_blocks_read = read_stext(&tex, stext);
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) { fz_drop_stext_page(fzctx, stext);
if (tex.dyn_buffer.cur >= ctx->content_size) {
fz_drop_page(fzctx, page);
break; break;
} }
}
text_buffer_terminate_string(&thread_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur); // If OCR is enabled and no text is found on the page
if (ctx->tesseract_lang != NULL && num_blocks_read == 0) {
stext = fz_new_stext_page(fzctx, page_mediabox);
stext_dev = new_stext_dev(fzctx, stext);
fz_device *ocr_dev = fz_new_ocr_device(fzctx, stext_dev, fz_identity,
page_mediabox, TRUE,
ctx->tesseract_lang,
ctx->tesseract_path,
ocr_progress, ctx);
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, ocr_dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, ocr_dev);
fz_drop_device(fzctx, ocr_dev);
} fz_catch(fzctx) err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message);
fz_close_device(fzctx, stext_dev);
fz_drop_device(fzctx, stext_dev);
text_buffer_destroy(&tex);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_close_device(fzctx, stext_dev);
fz_drop_device(fzctx, stext_dev);
read_stext(&tex, stext);
fz_drop_stext_page(fzctx, stext);
}
fz_drop_page(fzctx, page);
}
text_buffer_terminate_string(&tex);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta_content->key = MetaContent; meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur); memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
APPEND_META(doc, meta_content); APPEND_META(doc, meta_content);
text_buffer_destroy(&thread_buffer); text_buffer_destroy(&tex);
} }
fz_drop_stream(fzctx, stream); fz_drop_stream(fzctx, stream);

View File

@ -118,13 +118,12 @@ static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, i
AVPacket packet; AVPacket packet;
AVSubtitle subtitle; AVSubtitle subtitle;
AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id); const AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec); AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
decoder->thread_count = 1;
avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar); avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
avcodec_open2(decoder, subtitle_codec, NULL); avcodec_open2(decoder, subtitle_codec, NULL);
decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
int got_sub; int got_sub;
while (1) { while (1) {
@ -177,8 +176,6 @@ read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *d
result->packet = av_packet_alloc(); result->packet = av_packet_alloc();
result->frame = av_frame_alloc(); result->frame = av_frame_alloc();
av_init_packet(result->packet);
int receive_ret = -EAGAIN; int receive_ret = -EAGAIN;
while (receive_ret == -EAGAIN) { while (receive_ret == -EAGAIN) {
// Get video frame // Get video frame
@ -477,13 +474,12 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
avcodec_send_frame(thumbnail_encoder, scaled_frame); avcodec_send_frame(thumbnail_encoder, scaled_frame);
avcodec_send_frame(thumbnail_encoder, NULL); // send EOF avcodec_send_frame(thumbnail_encoder, NULL); // send EOF
AVPacket thumbnail_packet; AVPacket *thumbnail_packet = av_packet_alloc();
av_init_packet(&thumbnail_packet); avcodec_receive_packet(thumbnail_encoder, thumbnail_packet);
avcodec_receive_packet(thumbnail_encoder, &thumbnail_packet);
// Save thumbnail // Save thumbnail
if (thumbnail_index == 0) { if (thumbnail_index == 0) {
ctx->store(doc->doc_id, 0, thumbnail_packet.data, thumbnail_packet.size); ctx->store(doc->doc_id, 0, thumbnail_packet->data, thumbnail_packet->size);
return_value = SAVE_THUMBNAIL_OK; return_value = SAVE_THUMBNAIL_OK;
} else if (thumbnail_index > 1) { } else if (thumbnail_index > 1) {
@ -491,7 +487,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
// I figure out a better fix. // I figure out a better fix.
thumbnail_index -= 1; thumbnail_index -= 1;
ctx->store(doc->doc_id, thumbnail_index, thumbnail_packet.data, thumbnail_packet.size); ctx->store(doc->doc_id, thumbnail_index, thumbnail_packet->data, thumbnail_packet->size);
return_value = SAVE_THUMBNAIL_OK; return_value = SAVE_THUMBNAIL_OK;
} else { } else {
@ -499,7 +495,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
} }
avcodec_free_context(&thumbnail_encoder); avcodec_free_context(&thumbnail_encoder);
av_packet_unref(&thumbnail_packet); av_packet_free(&thumbnail_packet);
av_free(*scaled_frame->data); av_free(*scaled_frame->data);
av_frame_free(&scaled_frame); av_frame_free(&scaled_frame);
} }
@ -578,8 +574,9 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
} }
// Decoder // Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id); const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec); AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
decoder->thread_count = 1;
avcodec_parameters_to_context(decoder, stream->codecpar); avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL); avcodec_open2(decoder, video_codec, NULL);
@ -630,6 +627,9 @@ void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()"); CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
return; return;
} }
pFormatCtx->max_analyze_duration = 100000000;
pFormatCtx->probesize = 100000000;
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL); int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) { if (res < 0) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res)); CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res));
@ -729,6 +729,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc,
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()"); CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
return; return;
} }
pFormatCtx->max_analyze_duration = 100000000;
pFormatCtx->probesize = 100000000;
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE); unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
AVIOContext *io_ctx = NULL; AVIOContext *io_ctx = NULL;
@ -792,6 +795,8 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()"); CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
return FALSE; return FALSE;
} }
pFormatCtx->max_analyze_duration = 100000000;
pFormatCtx->probesize = 100000000;
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE); unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
@ -823,6 +828,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
// Decoder // Decoder
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id); const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec); AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
decoder->thread_count = 1;
avcodec_parameters_to_context(decoder, stream->codecpar); avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL); avcodec_open2(decoder, video_codec, NULL);
@ -861,15 +867,14 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
avcodec_send_frame(jpeg_encoder, scaled_frame); avcodec_send_frame(jpeg_encoder, scaled_frame);
avcodec_send_frame(jpeg_encoder, NULL); // Send EOF avcodec_send_frame(jpeg_encoder, NULL); // Send EOF
AVPacket jpeg_packet; AVPacket *jpeg_packet = av_packet_alloc();
av_init_packet(&jpeg_packet); avcodec_receive_packet(jpeg_encoder, jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail // Save thumbnail
APPEND_LONG_META(doc, MetaThumbnail, 1); APPEND_LONG_META(doc, MetaThumbnail, 1);
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size); ctx->store(doc->doc_id, 0, jpeg_packet->data, jpeg_packet->size);
av_packet_unref(&jpeg_packet); av_packet_free(&jpeg_packet);
avcodec_free_context(&jpeg_encoder); avcodec_free_context(&jpeg_encoder);
av_free(*scaled_frame->data); av_free(*scaled_frame->data);
av_frame_free(&scaled_frame); av_frame_free(&scaled_frame);