From 59fe31d6dda8a07fe34648c38ecca1fca678df8c Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 8 Apr 2020 11:11:30 -0400 Subject: [PATCH] ebook tests (wip), +author meta, ebook bug fixes (content_len & cover err) --- .gitignore | 3 ++- libscan/ebook/ebook.c | 61 +++++++++++++++++++++++++++---------------- libscan/scan.h | 1 + test/main.cpp | 59 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 5339f1a..deb4f94 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ CMakeFiles CMakeCache.txt scan_test third-party/ -libscan-test-files \ No newline at end of file +libscan-test-files +scan_*_test \ No newline at end of file diff --git a/libscan/ebook/ebook.c b/libscan/ebook/ebook.c index 6adba8c..690b996 100644 --- a/libscan/ebook/ebook.c +++ b/libscan/ebook/ebook.c @@ -27,7 +27,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d if (err != 0) { fz_drop_page(fzctx, cover); CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message) - return FALSE; + return -1; } fz_rect bounds = fz_bound_page(fzctx, cover); @@ -68,7 +68,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message) fz_drop_page(fzctx, cover); fz_drop_pixmap(fzctx, pixmap); - return FALSE; + return -1; } fz_buffer *fzbuf = NULL; @@ -93,10 +93,10 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d if (err != 0) { CTX_LOG_WARNINGF(doc->filepath, "fz_new_buffer_from_pixmap_as_png() returned error code [%d] %s", err, fzctx->error.message) - return FALSE; + return -1; } - return TRUE; + return 0; } void fz_err_callback(void *user, const char *message) { @@ -121,6 +121,7 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) { fz_stext_line *line = block->u.t.first_line; while (line != NULL) { + text_buffer_append_char(tex, ' '); fz_stext_char *c = line->first_char; while (c != NULL) { if (text_buffer_append_char(tex, c->c) == TEXT_BUF_FULL) { @@ -130,6 +131,7 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) { } line = line->next; } + text_buffer_append_char(tex, ' '); return 0; } @@ -185,10 +187,10 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha fz_var(err); fz_try(fzctx) - { - stream = fz_open_memory(fzctx, buf, buf_len); - fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream); - } + { + stream = fz_open_memory(fzctx, buf, buf_len); + fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream); + } fz_catch(fzctx) err = fzctx->error.errcode; @@ -199,23 +201,36 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha return; } - char title[4096] = {'\0',}; + char title[8192] = {'\0',}; fz_try(fzctx) - fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); + fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); fz_catch(fzctx) ; if (strlen(title) > 0) { - meta_line_t *meta_content = malloc(sizeof(meta_line_t) + strlen(title)); - meta_content->key = MetaTitle; - strcpy(meta_content->str_val, title); - APPEND_META(doc, meta_content) + meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title)); + meta_title->key = MetaTitle; + strcpy(meta_title->str_val, title); + APPEND_META(doc, meta_title) + } + + char author[4096] = {'\0',}; + fz_try(fzctx) + fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); + fz_catch(fzctx) + ; + + if (strlen(author) > 0) { + meta_line_t *meta_author = malloc(sizeof(meta_line_t) + strlen(author)); + meta_author->key = MetaAuthor; + strcpy(meta_author->str_val, author); + APPEND_META(doc, meta_author) } int page_count = -1; fz_var(err); fz_try(fzctx) - page_count = fz_count_pages(fzctx, fzdoc); + page_count = fz_count_pages(fzctx, fzdoc); fz_catch(fzctx) err = fzctx->error.errcode; @@ -231,7 +246,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha err = render_cover(ctx, fzctx, doc, fzdoc); } - if (err == TRUE) { + if (err != 0) { fz_drop_stream(fzctx, stream); fz_drop_document(fzctx, fzdoc); fz_drop_context(fzctx); @@ -246,7 +261,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha fz_page *page = NULL; fz_var(err); fz_try(fzctx) - page = fz_load_page(fzctx, fzdoc, current_page); + page = fz_load_page(fzctx, fzdoc, current_page); fz_catch(fzctx) err = fzctx->error.errcode; if (err != 0) { @@ -273,12 +288,12 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha fz_var(err); fz_try(fzctx) - fz_run_page(fzctx, page, dev, fz_identity, NULL); + fz_run_page(fzctx, page, dev, fz_identity, NULL); fz_always(fzctx) - { - fz_close_device(fzctx, dev); - fz_drop_device(fzctx, dev); - } + { + fz_close_device(fzctx, dev); + fz_drop_device(fzctx, dev); + } fz_catch(fzctx) err = fzctx->error.errcode; @@ -304,7 +319,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const cha fz_drop_stext_page(fzctx, stext); fz_drop_page(fzctx, page); - if (thread_buffer.dyn_buffer.cur >= thread_buffer.dyn_buffer.size) { + if (thread_buffer.dyn_buffer.cur >= ctx->content_size) { break; } } diff --git a/libscan/scan.h b/libscan/scan.h index 6a70c2c..c84a143 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -78,6 +78,7 @@ enum metakey { MetaExifModel = META_STR(21), MetaExifIsoSpeedRatings = META_STR(22), MetaExifDateTime = META_STR(23), + MetaAuthor = META_STR(24), }; typedef struct meta_line { diff --git a/test/main.cpp b/test/main.cpp index 52a7145..49eda6c 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -4,12 +4,20 @@ extern "C" { #include "../libscan/arc/arc.h" #include "../libscan/text/text.h" +#include "../libscan/ebook/ebook.h" } static scan_arc_ctx_t arc_recurse_ctx; static scan_arc_ctx_t arc_list_ctx; + static scan_text_ctx_t text_500_ctx; +static scan_ebook_ctx_t ebook_ctx; +static scan_ebook_ctx_t ebook_500_ctx; + + + +/* Text */ TEST(Text, BookCsvContentLen) { vfile_t f; @@ -70,6 +78,45 @@ TEST(Text, MemUtf8_Invalid2) { cleanup(&doc, &f); } +TEST(Text, MemWhitespace) { + const char *content = "\n \ttest\t\ntest test "; + vfile_t f; + document_t doc; + load_doc_mem((void *) content, strlen(content), &f, &doc); + + parse_text(&text_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "test test test"); + cleanup(&doc, &f); +} + +/* Ebook */ + +TEST(Ebook, CandlePdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ebook/General_-_Candle_Making.pdf", &f, &doc); + + parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc); + + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Microsoft Word - A531 Candlemaking-01.doc"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Dafydd Prichard"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' '); + cleanup(&doc, &f); +} + +TEST(Ebook, Utf8Pdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ebook/utf8.pdf", &f, &doc); + + parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc); + + ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "最後測試 ")); + cleanup(&doc, &f); +} + int main(int argc, char **argv) { arc_recurse_ctx.log = noop_log; @@ -87,6 +134,18 @@ int main(int argc, char **argv) { text_500_ctx.log = noop_log; text_500_ctx.logf = noop_logf; + ebook_ctx.content_size = 999999999999; + ebook_ctx.store = noop_store; + ebook_ctx.tesseract_lang = "eng"; + ebook_ctx.tesseract_path = "./tessdata"; + ebook_ctx.tn_size = 500; + pthread_mutex_init(&ebook_ctx.mupdf_mutex, nullptr); + ebook_ctx.log = noop_log; + ebook_ctx.logf = noop_logf; + + ebook_500_ctx = ebook_ctx; + ebook_500_ctx.content_size = 500; + ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } \ No newline at end of file