mirror of
https://github.com/simon987/sist2.git
synced 2025-04-18 17:56:44 +00:00
Use mupdf's OCR methods rather than raw tesseract, various fixes
This commit is contained in:
parent
5a1a04629f
commit
2596361af5
@ -206,7 +206,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
|
||||
3. Install vcpkg dependencies
|
||||
|
||||
```bash
|
||||
vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp]
|
||||
vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,ffprobe,zlib]
|
||||
```
|
||||
|
||||
4. Build
|
||||
|
@ -149,6 +149,7 @@ void database_generate_stats(database_t *db, double treemap_threshold) {
|
||||
|
||||
merged_rows += 1;
|
||||
}
|
||||
free(iter);
|
||||
} while (merged_rows > TREEMAP_MINIMUM_MERGES_TO_CONTINUE);
|
||||
|
||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db,
|
||||
|
156
third-party/libscan/libscan/ebook/ebook.c
vendored
156
third-party/libscan/libscan/ebook/ebook.c
vendored
@ -11,8 +11,6 @@
|
||||
pthread_mutex_t Mutex;
|
||||
#endif
|
||||
|
||||
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
|
||||
__thread text_buffer_t thread_buffer;
|
||||
__thread scan_ebook_ctx_t thread_ctx;
|
||||
|
||||
static void my_fz_lock(UNUSED(void *user), int lock) {
|
||||
@ -232,21 +230,54 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fill_image_ocr_cb(const char* text, size_t len) {
|
||||
text_buffer_append_string(&thread_buffer, text, len - 1);
|
||||
static int ocr_progress(fz_context *fzctx, void *user_data, int progress) {
|
||||
scan_ebook_ctx_t *ctx = user_data;
|
||||
CTX_LOG_INFOF("ebook.c", "OCR PROGRESS=%d", progress);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
|
||||
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
|
||||
UNUSED(fz_color_params color_params)) {
|
||||
int read_stext(text_buffer_t *tex, fz_stext_page *stext) {
|
||||
|
||||
int l2factor = 0;
|
||||
int count = 0;
|
||||
|
||||
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
|
||||
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
|
||||
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, (int)pix->stride, pix->xres, fill_image_ocr_cb);
|
||||
fz_drop_pixmap(fzctx, pix);
|
||||
fz_stext_block *block = stext->first_block;
|
||||
|
||||
while (block != NULL) {
|
||||
int ret = read_stext_block(block, tex);
|
||||
count += 1;
|
||||
if (ret == TEXT_BUF_FULL) {
|
||||
break;
|
||||
}
|
||||
block = block->next;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
int load_page(fz_context *fzctx, fz_document *fzdoc, int current_page, fz_page **page) {
|
||||
int err = 0;
|
||||
|
||||
fz_var(err);
|
||||
fz_try(fzctx)(*page) = fz_load_page(fzctx, fzdoc, current_page);
|
||||
fz_catch(fzctx)err = fzctx->error.errcode;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
fz_device *new_stext_dev(fz_context *fzctx, fz_stext_page *stext) {
|
||||
fz_stext_options opts = {
|
||||
.flags = FZ_STEXT_DEHYPHENATE,
|
||||
.scale = 0
|
||||
};
|
||||
|
||||
fz_device *stext_dev = fz_new_stext_device(fzctx, stext, &opts);
|
||||
stext_dev->stroke_path = NULL;
|
||||
stext_dev->stroke_text = NULL;
|
||||
stext_dev->clip_text = NULL;
|
||||
stext_dev->clip_stroke_path = NULL;
|
||||
stext_dev->clip_stroke_text = NULL;
|
||||
return stext_dev;
|
||||
}
|
||||
|
||||
void
|
||||
@ -326,46 +357,37 @@ parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mi
|
||||
|
||||
|
||||
if (ctx->content_size > 0) {
|
||||
fz_stext_options opts = {0};
|
||||
thread_buffer = text_buffer_create(ctx->content_size);
|
||||
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
||||
|
||||
for (int current_page = 0; current_page < page_count; current_page++) {
|
||||
fz_page *page = NULL;
|
||||
fz_var(err);
|
||||
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
|
||||
fz_catch(fzctx)err = fzctx->error.errcode;
|
||||
err = load_page(fzctx, fzdoc, current_page, &page);
|
||||
|
||||
if (err != 0) {
|
||||
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message);
|
||||
text_buffer_destroy(&thread_buffer);
|
||||
CTX_LOG_WARNINGF(doc->filepath,
|
||||
"fz_load_page() returned error code [%d] %s", err, fzctx->error.message);
|
||||
text_buffer_destroy(&tex);
|
||||
fz_drop_page(fzctx, page);
|
||||
fz_drop_stream(fzctx, stream);
|
||||
fz_drop_document(fzctx, fzdoc);
|
||||
fz_drop_context(fzctx);
|
||||
return;
|
||||
}
|
||||
fz_rect page_mediabox = fz_bound_page(fzctx, page);
|
||||
|
||||
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
|
||||
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
|
||||
dev->stroke_path = NULL;
|
||||
dev->stroke_text = NULL;
|
||||
dev->clip_text = NULL;
|
||||
dev->clip_stroke_path = NULL;
|
||||
dev->clip_stroke_text = NULL;
|
||||
|
||||
if (ctx->tesseract_lang != NULL) {
|
||||
dev->fill_image = fill_image;
|
||||
}
|
||||
fz_stext_page *stext = fz_new_stext_page(fzctx, page_mediabox);
|
||||
fz_device *stext_dev = new_stext_dev(fzctx, stext);
|
||||
|
||||
fz_var(err);
|
||||
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
|
||||
fz_try(fzctx)fz_run_page(fzctx, page, stext_dev, fz_identity, NULL);
|
||||
fz_always(fzctx) {
|
||||
fz_close_device(fzctx, dev);
|
||||
fz_drop_device(fzctx, dev);
|
||||
} fz_catch(fzctx)err = fzctx->error.errcode;
|
||||
fz_close_device(fzctx, stext_dev);
|
||||
fz_drop_device(fzctx, stext_dev);
|
||||
} fz_catch(fzctx) err = fzctx->error.errcode;
|
||||
|
||||
if (err != 0) {
|
||||
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message);
|
||||
text_buffer_destroy(&thread_buffer);
|
||||
text_buffer_destroy(&tex);
|
||||
fz_drop_page(fzctx, page);
|
||||
fz_drop_stext_page(fzctx, stext);
|
||||
fz_drop_stream(fzctx, stream);
|
||||
@ -374,29 +396,63 @@ parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mi
|
||||
return;
|
||||
}
|
||||
|
||||
fz_stext_block *block = stext->first_block;
|
||||
while (block != NULL) {
|
||||
int ret = read_stext_block(block, &thread_buffer);
|
||||
if (ret == TEXT_BUF_FULL) {
|
||||
break;
|
||||
}
|
||||
block = block->next;
|
||||
}
|
||||
int num_blocks_read = read_stext(&tex, stext);
|
||||
|
||||
fz_drop_stext_page(fzctx, stext);
|
||||
fz_drop_page(fzctx, page);
|
||||
|
||||
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
|
||||
if (tex.dyn_buffer.cur >= ctx->content_size) {
|
||||
fz_drop_page(fzctx, page);
|
||||
break;
|
||||
}
|
||||
}
|
||||
text_buffer_terminate_string(&thread_buffer);
|
||||
|
||||
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
|
||||
// If OCR is enabled and no text is found on the page
|
||||
if (ctx->tesseract_lang != NULL && num_blocks_read == 0) {
|
||||
stext = fz_new_stext_page(fzctx, page_mediabox);
|
||||
stext_dev = new_stext_dev(fzctx, stext);
|
||||
|
||||
fz_device *ocr_dev = fz_new_ocr_device(fzctx, stext_dev, fz_identity,
|
||||
page_mediabox, TRUE,
|
||||
ctx->tesseract_lang,
|
||||
ctx->tesseract_path,
|
||||
ocr_progress, ctx);
|
||||
|
||||
fz_var(err);
|
||||
fz_try(fzctx)fz_run_page(fzctx, page, ocr_dev, fz_identity, NULL);
|
||||
fz_always(fzctx) {
|
||||
fz_close_device(fzctx, ocr_dev);
|
||||
fz_drop_device(fzctx, ocr_dev);
|
||||
} fz_catch(fzctx) err = fzctx->error.errcode;
|
||||
|
||||
if (err != 0) {
|
||||
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message);
|
||||
fz_close_device(fzctx, stext_dev);
|
||||
fz_drop_device(fzctx, stext_dev);
|
||||
text_buffer_destroy(&tex);
|
||||
fz_drop_page(fzctx, page);
|
||||
fz_drop_stext_page(fzctx, stext);
|
||||
fz_drop_stream(fzctx, stream);
|
||||
fz_drop_document(fzctx, fzdoc);
|
||||
fz_drop_context(fzctx);
|
||||
return;
|
||||
}
|
||||
|
||||
fz_close_device(fzctx, stext_dev);
|
||||
fz_drop_device(fzctx, stext_dev);
|
||||
|
||||
read_stext(&tex, stext);
|
||||
fz_drop_stext_page(fzctx, stext);
|
||||
}
|
||||
|
||||
fz_drop_page(fzctx, page);
|
||||
}
|
||||
text_buffer_terminate_string(&tex);
|
||||
|
||||
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
|
||||
meta_content->key = MetaContent;
|
||||
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
|
||||
memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
|
||||
APPEND_META(doc, meta_content);
|
||||
|
||||
text_buffer_destroy(&thread_buffer);
|
||||
text_buffer_destroy(&tex);
|
||||
}
|
||||
|
||||
fz_drop_stream(fzctx, stream);
|
||||
|
39
third-party/libscan/libscan/media/media.c
vendored
39
third-party/libscan/libscan/media/media.c
vendored
@ -118,13 +118,12 @@ static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, i
|
||||
AVPacket packet;
|
||||
AVSubtitle subtitle;
|
||||
|
||||
AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
|
||||
const AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
|
||||
AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
|
||||
decoder->thread_count = 1;
|
||||
avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
|
||||
avcodec_open2(decoder, subtitle_codec, NULL);
|
||||
|
||||
decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
|
||||
|
||||
int got_sub;
|
||||
|
||||
while (1) {
|
||||
@ -177,8 +176,6 @@ read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *d
|
||||
result->packet = av_packet_alloc();
|
||||
result->frame = av_frame_alloc();
|
||||
|
||||
av_init_packet(result->packet);
|
||||
|
||||
int receive_ret = -EAGAIN;
|
||||
while (receive_ret == -EAGAIN) {
|
||||
// Get video frame
|
||||
@ -477,13 +474,12 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
avcodec_send_frame(thumbnail_encoder, scaled_frame);
|
||||
avcodec_send_frame(thumbnail_encoder, NULL); // send EOF
|
||||
|
||||
AVPacket thumbnail_packet;
|
||||
av_init_packet(&thumbnail_packet);
|
||||
avcodec_receive_packet(thumbnail_encoder, &thumbnail_packet);
|
||||
AVPacket *thumbnail_packet = av_packet_alloc();
|
||||
avcodec_receive_packet(thumbnail_encoder, thumbnail_packet);
|
||||
|
||||
// Save thumbnail
|
||||
if (thumbnail_index == 0) {
|
||||
ctx->store(doc->doc_id, 0, thumbnail_packet.data, thumbnail_packet.size);
|
||||
ctx->store(doc->doc_id, 0, thumbnail_packet->data, thumbnail_packet->size);
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
|
||||
} else if (thumbnail_index > 1) {
|
||||
@ -491,7 +487,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
// I figure out a better fix.
|
||||
thumbnail_index -= 1;
|
||||
|
||||
ctx->store(doc->doc_id, thumbnail_index, thumbnail_packet.data, thumbnail_packet.size);
|
||||
ctx->store(doc->doc_id, thumbnail_index, thumbnail_packet->data, thumbnail_packet->size);
|
||||
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
} else {
|
||||
@ -499,7 +495,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
}
|
||||
|
||||
avcodec_free_context(&thumbnail_encoder);
|
||||
av_packet_unref(&thumbnail_packet);
|
||||
av_packet_free(&thumbnail_packet);
|
||||
av_free(*scaled_frame->data);
|
||||
av_frame_free(&scaled_frame);
|
||||
}
|
||||
@ -578,8 +574,9 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
|
||||
}
|
||||
|
||||
// Decoder
|
||||
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
|
||||
decoder->thread_count = 1;
|
||||
avcodec_parameters_to_context(decoder, stream->codecpar);
|
||||
avcodec_open2(decoder, video_codec, NULL);
|
||||
|
||||
@ -630,6 +627,9 @@ void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_
|
||||
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
|
||||
return;
|
||||
}
|
||||
pFormatCtx->max_analyze_duration = 100000000;
|
||||
pFormatCtx->probesize = 100000000;
|
||||
|
||||
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
|
||||
if (res < 0) {
|
||||
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res));
|
||||
@ -729,6 +729,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc,
|
||||
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
|
||||
return;
|
||||
}
|
||||
pFormatCtx->max_analyze_duration = 100000000;
|
||||
pFormatCtx->probesize = 100000000;
|
||||
|
||||
|
||||
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
|
||||
AVIOContext *io_ctx = NULL;
|
||||
@ -792,6 +795,8 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()");
|
||||
return FALSE;
|
||||
}
|
||||
pFormatCtx->max_analyze_duration = 100000000;
|
||||
pFormatCtx->probesize = 100000000;
|
||||
|
||||
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
|
||||
|
||||
@ -823,6 +828,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
// Decoder
|
||||
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
|
||||
decoder->thread_count = 1;
|
||||
avcodec_parameters_to_context(decoder, stream->codecpar);
|
||||
avcodec_open2(decoder, video_codec, NULL);
|
||||
|
||||
@ -861,15 +867,14 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
avcodec_send_frame(jpeg_encoder, scaled_frame);
|
||||
avcodec_send_frame(jpeg_encoder, NULL); // Send EOF
|
||||
|
||||
AVPacket jpeg_packet;
|
||||
av_init_packet(&jpeg_packet);
|
||||
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
|
||||
AVPacket *jpeg_packet = av_packet_alloc();
|
||||
avcodec_receive_packet(jpeg_encoder, jpeg_packet);
|
||||
|
||||
// Save thumbnail
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet->data, jpeg_packet->size);
|
||||
|
||||
av_packet_unref(&jpeg_packet);
|
||||
av_packet_free(&jpeg_packet);
|
||||
avcodec_free_context(&jpeg_encoder);
|
||||
av_free(*scaled_frame->data);
|
||||
av_frame_free(&scaled_frame);
|
||||
|
Loading…
x
Reference in New Issue
Block a user