From 9e0d7bf992678cda6c29abc938329ec67f641f55 Mon Sep 17 00:00:00 2001 From: simon987 Date: Thu, 2 Feb 2023 19:52:37 -0500 Subject: [PATCH] Add test files as submodule, remove support for msword thumbnails --- .gitmodules | 3 ++ README.md | 2 +- third-party/libscan/libscan-test-files | 1 + third-party/libscan/libscan/msdoc/msdoc.c | 58 ----------------------- third-party/libscan/libscan/msdoc/msdoc.h | 2 - third-party/libscan/test/main.cpp | 23 --------- 6 files changed, 5 insertions(+), 84 deletions(-) create mode 160000 third-party/libscan/libscan-test-files diff --git a/.gitmodules b/.gitmodules index 9c7e90c..ddc49f2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/libscan/third-party/libmobi"] path = third-party/libscan/third-party/libmobi url = https://github.com/bfabiszewski/libmobi +[submodule "third-party/libscan/libscan-test-files"] + path = third-party/libscan/libscan-test-files + url = https://github.com/simon987/libscan-test-files diff --git a/README.md b/README.md index 3a53e0b..a0f874c 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ See [Usage guide](docs/USAGE.md) for more details | html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - | | tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no | | docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title | -| doc (MS Word 97-2003) | antiword | yes | yes | author, title | +| doc (MS Word 97-2003) | antiword | yes | no | author, title | | mobi, azw, azw3 | libmobi | yes | no | author, title | | wpd (WordPerfect) | libwpd | yes | no | *planned* | | json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - | diff --git a/third-party/libscan/libscan-test-files b/third-party/libscan/libscan-test-files new file mode 160000 index 0000000..cdf1f89 --- /dev/null +++ b/third-party/libscan/libscan-test-files @@ -0,0 +1 @@ +Subproject commit cdf1f89423424b2091520bfe8c580d682fe01f7d diff --git a/third-party/libscan/libscan/msdoc/msdoc.c b/third-party/libscan/libscan/msdoc/msdoc.c index e34f304..a628ea6 100644 --- a/third-party/libscan/libscan/msdoc/msdoc.c +++ b/third-party/libscan/libscan/msdoc/msdoc.c @@ -4,8 +4,6 @@ #include #include "../../third-party/antiword/src/antiword.h" -#include "../ebook/ebook.h" - void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) { // Open word doc @@ -71,57 +69,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi free(out_buf); } -void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) { - - scan_ebook_ctx_t ebook_ctx = { - .content_size = ctx->content_size, - .tn_size = ctx->tn_size, - .enable_tn = TRUE, - .log = ctx->log, - .logf = ctx->logf, - .store = ctx->store, - }; - - // Open word doc - options_type *opts = direct_vGetOptions(); - opts->iParagraphBreak = 74; - opts->eConversionType = conversion_pdf; - opts->bHideHiddenText = 1; - opts->bRemoveRemovedText = 1; - opts->bUseLandscape = 0; - opts->eEncoding = encoding_latin_1; - opts->iPageHeight = 842; // A4 - opts->iPageWidth = 595; - opts->eImageLevel = level_ps_3; - - int doc_word_version = iGuessVersionNumber(file, (int) buf_len); - if (doc_word_version < 0 || doc_word_version == 3) { - free(buf); - return; - } - rewind(file); - - size_t out_len; - char *out_buf; - - FILE *file_out = open_memstream(&out_buf, &out_len); - - diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); - if (diag == NULL) { - return; - } - - bWordDecryptor(file, (int) buf_len, diag); - vDestroyDiagram(diag); - - fclose(file_out); - - parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE); - - free(buf); - free(out_buf); -} - void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { size_t buf_len; @@ -138,11 +85,6 @@ void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { return; } - if (ctx->enable_tn) { - char *buf_pdf = malloc(buf_len); - memcpy(buf_pdf, buf, buf_len); - parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len); - } parse_msdoc_text(ctx, doc, file, buf, buf_len); fclose(file); } diff --git a/third-party/libscan/libscan/msdoc/msdoc.h b/third-party/libscan/libscan/msdoc/msdoc.h index ff72b00..a5b14bc 100644 --- a/third-party/libscan/libscan/msdoc/msdoc.h +++ b/third-party/libscan/libscan/msdoc/msdoc.h @@ -5,8 +5,6 @@ typedef struct { long content_size; - int enable_tn; - int tn_size; log_callback_t log; logf_callback_t logf; store_callback_t store; diff --git a/third-party/libscan/test/main.cpp b/third-party/libscan/test/main.cpp index df57379..f9b561e 100644 --- a/third-party/libscan/test/main.cpp +++ b/third-party/libscan/test/main.cpp @@ -916,15 +916,12 @@ TEST(Msdoc, Test1Pdf) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -934,15 +931,12 @@ TEST(Msdoc, Test1Text) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_text_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_EQ(size_before, store_size); cleanup(&doc, &f); } @@ -952,15 +946,12 @@ TEST(Msdoc, Test2Pdf) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "GNU Free Documentation License") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging Information Format"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -970,15 +961,12 @@ TEST(Msdoc, Test3Pdf) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -988,15 +976,12 @@ TEST(Msdoc, Test4Pdf) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "SQL Server international data types") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -1012,7 +997,6 @@ TEST(Msdoc, TestUtf8Pdf) { ASSERT_NE(get_meta(&doc, MetaContent), nullptr); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -1034,14 +1018,11 @@ TEST(Msdoc, Test5Pdf) { document_t doc; load_doc_file("libscan-test-files/test_files/msdoc/test5.doc", &f, &doc); - size_t size_before = store_size; - parse_msdoc(&msdoc_ctx, &f, &doc); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "орган Федеральной") != nullptr); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "uswo"); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); - ASSERT_NE(size_before, store_size); cleanup(&doc, &f); } @@ -1184,15 +1165,11 @@ int main(int argc, char **argv) { msdoc_ctx.logf = noop_logf; msdoc_ctx.store = counter_store; msdoc_ctx.content_size = 500; - msdoc_ctx.tn_size = 500; - msdoc_ctx.enable_tn = TRUE; msdoc_text_ctx.log = noop_log; msdoc_text_ctx.logf = noop_logf; msdoc_text_ctx.store = counter_store; msdoc_text_ctx.content_size = 500; - msdoc_text_ctx.tn_size = 0; - msdoc_text_ctx.enable_tn = FALSE; wpd_ctx.log = noop_log; wpd_ctx.logf = noop_logf;