diff --git a/.gitmodules b/.gitmodules index a1b4278..d91406e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third-party/utf8.h"] path = third-party/utf8.h url = https://github.com/sheredom/utf8.h +[submodule "third-party/antiword"] + path = third-party/antiword + url = https://github.com/simon987/antiword diff --git a/CMakeLists.txt b/CMakeLists.txt index cda33d8..ac09c74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,12 @@ set(CMAKE_C_STANDARD 11) option(BUILD_TESTS "Build tests" off) +add_subdirectory(third-party/antiword) +add_compile_definitions( + antiword + NDEBUG +) + add_library( scan libscan/util.c libscan/util.h @@ -18,6 +24,7 @@ add_library( libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h libscan/media/media.c libscan/media/media.h libscan/font/font.c libscan/font/font.h + libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h third-party/utf8.h libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h) @@ -110,6 +117,7 @@ add_dependencies( scan libmobi ffmpeg + antiword ) target_link_libraries( @@ -161,6 +169,7 @@ target_link_libraries( ${JAS_LIB} ${GUMBO_LIB} dl + antiword ) target_include_directories( @@ -183,4 +192,8 @@ if (BUILD_TESTS) add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h) target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer) target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan) + + add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h) + target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer) + target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan) endif() diff --git a/libscan/ebook/ebook.c b/libscan/ebook/ebook.c index ad3c3fa..6d062c3 100644 --- a/libscan/ebook/ebook.c +++ b/libscan/ebook/ebook.c @@ -292,10 +292,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha ; if (strlen(title) > 0) { 
- meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title)); - meta_title->key = MetaTitle; - strcpy(meta_title->str_val, title); - APPEND_META(doc, meta_title) + APPEND_UTF8_META(doc, MetaTitle, title) } char author[4096] = {'\0',}; @@ -305,10 +302,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha ; if (strlen(author) > 0) { - meta_line_t *meta_author = malloc(sizeof(meta_line_t) + strlen(author)); - meta_author->key = MetaAuthor; - strcpy(meta_author->str_val, author); - APPEND_META(doc, meta_author) + APPEND_UTF8_META(doc, MetaAuthor, author) } int page_count = -1; diff --git a/libscan/macros.h b/libscan/macros.h index 600f3ba..cd6cb5e 100644 --- a/libscan/macros.h +++ b/libscan/macros.h @@ -1,16 +1,16 @@ -#ifndef FALSE -#define FALSE (0) +#ifndef FALSE +#define FALSE (0) #define BOOL int #endif -#ifndef TRUE -#define TRUE (!FALSE) +#ifndef TRUE +#define TRUE (!FALSE) #endif -#undef MAX +#undef MAX #define MAX(a, b) (((a) > (b)) ? (a) : (b)) -#undef MIN +#undef MIN #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #ifndef PATH_MAX @@ -18,7 +18,7 @@ #endif #undef ABS -#define ABS(a) (((a) < 0) ? -(a) : (a)) +#define ABS(a) (((a) < 0) ? 
-(a) : (a)) #define APPEND_STR_META(doc, keyname, value) \ {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ @@ -37,3 +37,25 @@ meta_str->key = MetaThumbnail; \ sprintf(meta_str->str_val, "%04d,%04d", width, height); \ APPEND_META(doc, meta_str)} + +#define APPEND_META(doc, meta) \ + meta->next = NULL;\ + if (doc->meta_head == NULL) {\ + doc->meta_head = meta;\ + doc->meta_tail = doc->meta_head;\ + } else {\ + doc->meta_tail->next = meta;\ + doc->meta_tail = meta;\ + } + +/* Run str through a temporary text buffer and append the result to doc as a keyname meta line. + * Braced like APPEND_STR_META so tex/meta_tag stay local and the macro can be expanded more than once per scope. */ +#define APPEND_UTF8_META(doc, keyname, str) \ + {text_buffer_t tex = text_buffer_create(-1); \ + text_buffer_append_string0(&tex, str); \ + text_buffer_terminate_string(&tex); \ + meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \ + meta_tag->key = keyname; \ + strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \ + APPEND_META(doc, meta_tag) \ + text_buffer_destroy(&tex);} diff --git a/libscan/media/media.c b/libscan/media/media.c index 7cdabcb..65bd5e4 100644 --- a/libscan/media/media.c +++ b/libscan/media/media.c @@ -166,15 +166,8 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic text_buffer_destroy(&tex); } -#define APPEND_TAG_META(doc, tag_, keyname) \ - text_buffer_t tex = text_buffer_create(-1); \ - text_buffer_append_string0(&tex, tag_->value); \ - text_buffer_terminate_string(&tex); \ - meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \ - meta_tag->key = keyname; \ - strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \ - APPEND_META(doc, meta_tag) \ - text_buffer_destroy(&tex); +#define APPEND_TAG_META(keyname) \ + APPEND_UTF8_META(doc, keyname, tag->value) #define STRCPY_TOLOWER(dst, str) \ strncpy(dst, str, sizeof(dst)); \ @@ -190,17 +183,17 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) { STRCPY_TOLOWER(key, tag->key) if (strcmp(key, "artist") == 0) { - APPEND_TAG_META(doc, tag, MetaArtist) + APPEND_TAG_META(MetaArtist) } else if (strcmp(key, "genre") == 0) { - 
APPEND_TAG_META(doc, tag, MetaGenre) + APPEND_TAG_META(MetaGenre) } else if (strcmp(key, "title") == 0) { - APPEND_TAG_META(doc, tag, MetaTitle) + APPEND_TAG_META(MetaTitle) } else if (strcmp(key, "album_artist") == 0) { - APPEND_TAG_META(doc, tag, MetaAlbumArtist) + APPEND_TAG_META(MetaAlbumArtist) } else if (strcmp(key, "album") == 0) { - APPEND_TAG_META(doc, tag, MetaAlbum) + APPEND_TAG_META(MetaAlbum) } else if (strcmp(key, "comment") == 0) { - APPEND_TAG_META(doc, tag, MetaContent) + APPEND_TAG_META(MetaContent) } } } @@ -244,25 +237,25 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f if (strcmp(key, "artist") == 0) { append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist); } else if (strcmp(tag->key, "ImageDescription") == 0) { - APPEND_TAG_META(doc, tag, MetaContent) + APPEND_TAG_META(MetaContent) } else if (strcmp(tag->key, "Make") == 0) { - APPEND_TAG_META(doc, tag, MetaExifMake) + APPEND_TAG_META(MetaExifMake) } else if (strcmp(tag->key, "Model") == 0) { - APPEND_TAG_META(doc, tag, MetaExifModel) + APPEND_TAG_META(MetaExifModel) } else if (strcmp(tag->key, "Software") == 0) { - APPEND_TAG_META(doc, tag, MetaExifSoftware) + APPEND_TAG_META(MetaExifSoftware) } else if (strcmp(tag->key, "FNumber") == 0) { - APPEND_TAG_META(doc, tag, MetaExifFNumber) + APPEND_TAG_META(MetaExifFNumber) } else if (strcmp(tag->key, "FocalLength") == 0) { - APPEND_TAG_META(doc, tag, MetaExifFocalLength) + APPEND_TAG_META(MetaExifFocalLength) } else if (strcmp(tag->key, "UserComment") == 0) { - APPEND_TAG_META(doc, tag, MetaExifUserComment) + APPEND_TAG_META(MetaExifUserComment) } else if (strcmp(tag->key, "ISOSpeedRatings") == 0) { - APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings) + APPEND_TAG_META(MetaExifIsoSpeedRatings) } else if (strcmp(tag->key, "ExposureTime") == 0) { - APPEND_TAG_META(doc, tag, MetaExifExposureTime) + APPEND_TAG_META(MetaExifExposureTime) } else if (strcmp(tag->key, "DateTime") == 0) { - APPEND_TAG_META(doc, tag, 
MetaExifDateTime) + APPEND_TAG_META(MetaExifDateTime) } } } diff --git a/libscan/msdoc/msdoc.c b/libscan/msdoc/msdoc.c new file mode 100644 index 0000000..21775e6 --- /dev/null +++ b/libscan/msdoc/msdoc.c @@ -0,0 +1,195 @@ +#include "msdoc.h" +#include + +#include +#include "../../third-party/antiword/src/antiword.h" + +#include "../ebook/ebook.h" + +/** + * Convert a Word document to UTF-8 text with antiword and append its + * author, title and text content to doc as metadata. + */ +void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { + + // Open file + size_t buf_len; + char *buf = read_all(f, &buf_len); + if (buf == NULL) { + CTX_LOG_ERROR(f->filepath, "read_all() failed") + return; + } + + FILE *file_in = fmemopen(buf, buf_len, "rb"); + if (file_in == NULL) { + free(buf); + CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) + return; + } + + // Open word doc + options_type *opts = direct_vGetOptions(); + opts->iParagraphBreak = 74; + opts->eConversionType = conversion_text; + opts->bHideHiddenText = 1; + opts->bRemoveRemovedText = 1; + opts->bUseLandscape = 0; + opts->eEncoding = encoding_utf_8; + opts->iPageHeight = 842; // A4 + opts->iPageWidth = 595; + opts->eImageLevel = level_ps_3; + + int doc_word_version = iGuessVersionNumber(file_in, buf_len); + if (doc_word_version < 0 || doc_word_version == 3) { + fclose(file_in); + free(buf); + return; + } + rewind(file_in); + + size_t out_len; + char *out_buf; + + FILE *file_out = open_memstream(&out_buf, &out_len); + if (file_out == NULL) { + CTX_LOG_ERRORF(f->filepath, "open_memstream() failed (%d)", errno) + fclose(file_in); + free(buf); + return; + } + + diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); + if (diag == NULL) { + // Release everything acquired so far, not only the input stream + fclose(file_out); + free(out_buf); + fclose(file_in); + free(buf); + return; + } + + iInitDocument(file_in, buf_len); + const char* author = szGetAuthor(); + if (author != NULL) { + APPEND_UTF8_META(doc, MetaAuthor, author) + } + + const char* title = szGetTitle(); + if (title != NULL) { + APPEND_UTF8_META(doc, MetaTitle, title) + } + vFreeDocument(); + + bWordDecryptor(file_in, buf_len, diag); + vDestroyDiagram(diag); + fclose(file_out); + + // out_len is final once file_out is closed; skip the content meta when nothing was produced + if (out_len > 0) { + text_buffer_t tex = text_buffer_create(ctx->content_size); + text_buffer_append_string(&tex, out_buf, 
out_len); + text_buffer_terminate_string(&tex); + + meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); + meta_content->key = MetaContent; + memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur); + APPEND_META(doc, meta_content) + + text_buffer_destroy(&tex); + } + + fclose(file_in); + free(buf); + free(out_buf); +} + +/** + * Convert a Word document to PDF in memory with antiword and hand the + * result to parse_ebook_mem() as "application/pdf". + */ +void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { + + scan_ebook_ctx_t ebook_ctx = { + .content_size = ctx->content_size, + .tn_size = ctx->tn_size, + .log = ctx->log, + .logf = ctx->logf, + .store = ctx->store, + }; + + // Open file + size_t buf_len; + char *buf = read_all(f, &buf_len); + if (buf == NULL) { + CTX_LOG_ERROR(f->filepath, "read_all() failed") + return; + } + + FILE *file = fmemopen(buf, buf_len, "rb"); + if (file == NULL) { + free(buf); + CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) + return; + } + // Open word doc + + options_type *opts = direct_vGetOptions(); + opts->iParagraphBreak = 74; + opts->eConversionType = conversion_pdf; + opts->bHideHiddenText = 1; + opts->bRemoveRemovedText = 1; + opts->bUseLandscape = 0; + opts->eEncoding = encoding_latin_2; + opts->iPageHeight = 842; // A4 + opts->iPageWidth = 595; + opts->eImageLevel = level_ps_3; + + int doc_word_version = iGuessVersionNumber(file, buf_len); + if (doc_word_version < 0 || doc_word_version == 3) { + fclose(file); + free(buf); + return; + } + rewind(file); + + size_t out_len; + char *out_buf; + + FILE *file_out = open_memstream(&out_buf, &out_len); + if (file_out == NULL) { + CTX_LOG_ERRORF(f->filepath, "open_memstream() failed (%d)", errno) + fclose(file); + free(buf); + return; + } + + diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); + if (diag == NULL) { + // Release everything acquired so far, not only the input stream + fclose(file_out); + free(out_buf); + fclose(file); + free(buf); + return; + } + + bWordDecryptor(file, buf_len, diag); + vDestroyDiagram(diag); + + fclose(file_out); + + parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc); + + fclose(file); + free(buf); + free(out_buf); +} + +// Use the PDF path when a thumbnail size is configured (tn_size > 0), plain text otherwise +void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { + if (ctx->tn_size > 0) { + 
parse_msdoc_pdf(ctx, f, doc); + } else { + parse_msdoc_text(ctx, f, doc); + } +} diff --git a/libscan/msdoc/msdoc.h b/libscan/msdoc/msdoc.h new file mode 100644 index 0000000..21579c6 --- /dev/null +++ b/libscan/msdoc/msdoc.h @@ -0,0 +1,22 @@ +#ifndef SCAN_SCAN_MSDOC_H +#define SCAN_SCAN_MSDOC_H + +#include "../scan.h" + +typedef struct { + long content_size; + int tn_size; + log_callback_t log; + logf_callback_t logf; + store_callback_t store; + unsigned int msdoc_mime; +} scan_msdoc_ctx_t; + +__always_inline +static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) { + return mime == ctx->msdoc_mime; +} + +void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc); + +#endif diff --git a/libscan/scan.h b/libscan/scan.h index 802ae77..055f595 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -1,6 +1,8 @@ #ifndef SCAN_SCAN_H #define SCAN_SCAN_H +#define _GNU_SOURCE + #include #include #include @@ -147,16 +149,6 @@ typedef struct parse_job_t { } parse_job_t; -#define APPEND_META(doc, meta) \ - meta->next = NULL;\ - if (doc->meta_head == NULL) {\ - doc->meta_head = meta;\ - doc->meta_tail = doc->meta_head;\ - } else {\ - doc->meta_tail->next = meta;\ - doc->meta_tail = meta;\ - } - #include "util.h" typedef void (*parse_callback_t)(parse_job_t *job); diff --git a/libscan/util.h b/libscan/util.h index 4e3fe28..959fda3 100644 --- a/libscan/util.h +++ b/libscan/util.h @@ -273,7 +273,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t return 0; } -static int text_buffer_append_string0(text_buffer_t *buf, char *str) { +static int text_buffer_append_string0(text_buffer_t *buf, const char *str) { return text_buffer_append_string(buf, str, strlen(str)); } diff --git a/test/main.cpp b/test/main.cpp index f0182c8..67d186b 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -10,6 +10,7 @@ extern "C" { #include "../libscan/ooxml/ooxml.h" #include "../libscan/mobi/scan_mobi.h" #include "../libscan/raw/raw.h" +#include 
"../libscan/msdoc/msdoc.h" #include } @@ -33,6 +34,10 @@ static scan_mobi_ctx_t mobi_500_ctx; static scan_raw_ctx_t raw_ctx; +static scan_msdoc_ctx_t msdoc_ctx; + +static scan_msdoc_ctx_t msdoc_text_ctx; + document_t LastSubDoc; @@ -689,6 +694,98 @@ TEST(RAW, Fuji) { cleanup(&doc, &f); } +/* msdoc */ +TEST(Msdoc, Test1Pdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); + + size_t size_before = store_size; + + parse_msdoc(&msdoc_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); + ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 57); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); + ASSERT_NE(size_before, store_size); + + cleanup(&doc, &f); +} + +TEST(Msdoc, Test1Text) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); + + size_t size_before = store_size; + + parse_msdoc(&msdoc_text_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_text_ctx.content_size, 4); // limit of the context actually passed to parse_msdoc + ASSERT_EQ(size_before, store_size); + + cleanup(&doc, &f); +} + +TEST(Msdoc, Test2Pdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc); + + size_t size_before = store_size; + + parse_msdoc(&msdoc_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "GNU Free Documentation License") != nullptr); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging 
Information Format"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); + ASSERT_NE(size_before, store_size); + + cleanup(&doc, &f); +} + +TEST(Msdoc, Test3Pdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc); + + size_t size_before = store_size; + + parse_msdoc(&msdoc_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); + ASSERT_NE(size_before, store_size); + + cleanup(&doc, &f); +} + +TEST(Msdoc, Test4Pdf) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc); + + size_t size_before = store_size; + + parse_msdoc(&msdoc_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "SQL Server international data types") != nullptr); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); + ASSERT_NE(size_before, store_size); + + cleanup(&doc, &f); +} + int main(int argc, char **argv) { setlocale(LC_ALL, ""); @@ -753,6 +850,18 @@ int main(int argc, char **argv) { raw_ctx.tn_size = 500; raw_ctx.tn_qscale = 5.0; + msdoc_ctx.log = noop_log; + msdoc_ctx.logf = noop_logf; + msdoc_ctx.store = counter_store; + msdoc_ctx.content_size = 500; + msdoc_ctx.tn_size = 500; + + msdoc_text_ctx.log = noop_log; + msdoc_text_ctx.logf = noop_logf; + msdoc_text_ctx.store = counter_store; + msdoc_text_ctx.content_size = 500; + msdoc_text_ctx.tn_size = 0; + 
av_log_set_level(AV_LOG_QUIET); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/third-party/antiword b/third-party/antiword new file mode 160000 index 0000000..be5e260 --- /dev/null +++ b/third-party/antiword @@ -0,0 +1 @@ +Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030 diff --git a/third-party/utf8.h b/third-party/utf8.h index fdcacc0..e976254 160000 --- a/third-party/utf8.h +++ b/third-party/utf8.h @@ -1 +1 @@ -Subproject commit fdcacc00ff48f7d268108dfb0ec7ebc485f1eb16 +Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50