From 070186fea07883bf4984831470158c2f68263e5d Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Sun, 15 Nov 2020 21:17:37 -0500
Subject: [PATCH] Add .doc support

---
 .gitmodules           |   3 +
 CMakeLists.txt        |  13 ++++
 libscan/ebook/ebook.c |  10 +--
 libscan/macros.h      |  34 +++++++--
 libscan/media/media.c |  43 +++++------
 libscan/msdoc/msdoc.c | 165 ++++++++++++++++++++++++++++++++++++++++++
 libscan/msdoc/msdoc.h |  22 ++++++
 libscan/scan.h        |  12 +--
 libscan/util.h        |   2 +-
 test/main.cpp         | 109 ++++++++++++++++++++++++++++
 third-party/antiword  |   1 +
 third-party/utf8.h    |   2 +-
 12 files changed, 364 insertions(+), 52 deletions(-)
 create mode 100644 libscan/msdoc/msdoc.c
 create mode 100644 libscan/msdoc/msdoc.h
 create mode 160000 third-party/antiword

diff --git a/.gitmodules b/.gitmodules
index a1b4278..d91406e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "third-party/utf8.h"]
 	path = third-party/utf8.h
 	url = https://github.com/sheredom/utf8.h
+[submodule "third-party/antiword"]
+	path = third-party/antiword
+	url = https://github.com/simon987/antiword
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cda33d8..ac09c74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,12 @@ set(CMAKE_C_STANDARD 11)
 
 option(BUILD_TESTS "Build tests" off)
 
+add_subdirectory(third-party/antiword)
+add_compile_definitions(
+        antiword
+        NDEBUG
+)
+
 add_library(
         scan
         libscan/util.c libscan/util.h
@@ -18,6 +24,7 @@ add_library(
         libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
         libscan/media/media.c libscan/media/media.h
         libscan/font/font.c libscan/font/font.h
+        libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
 
         third-party/utf8.h
         libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
@@ -110,6 +117,7 @@ add_dependencies(
         scan
         libmobi
         ffmpeg
+        antiword
 )
 
 target_link_libraries(
@@ -161,6 +169,7 @@ target_link_libraries(
         ${JAS_LIB}
         ${GUMBO_LIB}
         dl
+        antiword
 )
 
 target_include_directories(
@@ -183,4 +192,8 @@ if (BUILD_TESTS)
     add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
     target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
     target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
+
+    add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
+    target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
+    target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
 endif()
diff --git a/libscan/ebook/ebook.c b/libscan/ebook/ebook.c
index ad3c3fa..6d062c3 100644
--- a/libscan/ebook/ebook.c
+++ b/libscan/ebook/ebook.c
@@ -292,10 +292,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
         ;
 
     if (strlen(title) > 0) {
-        meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title));
-        meta_title->key = MetaTitle;
-        strcpy(meta_title->str_val, title);
-        APPEND_META(doc, meta_title)
+        APPEND_UTF8_META(doc, MetaTitle, title)
     }
 
     char author[4096] = {'\0',};
@@ -305,10 +302,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
         ;
 
     if (strlen(author) > 0) {
-        meta_line_t *meta_author = malloc(sizeof(meta_line_t) + strlen(author));
-        meta_author->key = MetaAuthor;
-        strcpy(meta_author->str_val, author);
-        APPEND_META(doc, meta_author)
+        APPEND_UTF8_META(doc, MetaAuthor, author)
     }
 
     int page_count = -1;
diff --git a/libscan/macros.h b/libscan/macros.h
index 600f3ba..cd6cb5e 100644
--- a/libscan/macros.h
+++ b/libscan/macros.h
@@ -1,16 +1,16 @@
-#ifndef	FALSE
-#define	FALSE	(0)
+#ifndef FALSE
+#define FALSE (0)
 #define BOOL int
 #endif
 
-#ifndef	TRUE
-#define	TRUE	(!FALSE)
+#ifndef TRUE
+#define TRUE (!FALSE)
 #endif
 
-#undef	MAX
+#undef MAX
 #define MAX(a, b)  (((a) > (b)) ? (a) : (b))
 
-#undef	MIN
+#undef MIN
 #define MIN(a, b)  (((a) < (b)) ? (a) : (b))
 
 #ifndef PATH_MAX
@@ -18,7 +18,7 @@
 #endif
 
 #undef ABS
-#define ABS(a)	   (((a) < 0) ? -(a) : (a))
+#define ABS(a) (((a) < 0) ? -(a) : (a))
 
 #define APPEND_STR_META(doc, keyname, value) \
     {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
@@ -37,3 +37,23 @@
     meta_str->key = MetaThumbnail; \
     sprintf(meta_str->str_val, "%04d,%04d", width, height); \
     APPEND_META(doc, meta_str)}
+
+// Appends meta to the tail of doc's metadata list; braces keep it a single statement.
+#define APPEND_META(doc, meta) \
+    {(meta)->next = NULL; \
+    if ((doc)->meta_head == NULL) { \
+        (doc)->meta_head = (meta); \
+    } else { \
+        (doc)->meta_tail->next = (meta); \
+    } \
+    (doc)->meta_tail = (meta);}
+
+// Appends str as a keyname entry of doc via a text_buffer_t; braces keep tex/meta_tag local.
+#define APPEND_UTF8_META(doc, keyname, str) \
+    {text_buffer_t tex = text_buffer_create(-1); \
+    text_buffer_append_string0(&tex, str); \
+    text_buffer_terminate_string(&tex); \
+    meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
+    meta_tag->key = keyname; strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
+    APPEND_META(doc, meta_tag) \
+    text_buffer_destroy(&tex);}
diff --git a/libscan/media/media.c b/libscan/media/media.c
index 7cdabcb..65bd5e4 100644
--- a/libscan/media/media.c
+++ b/libscan/media/media.c
@@ -166,15 +166,8 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic
     text_buffer_destroy(&tex);
 }
 
-#define APPEND_TAG_META(doc, tag_, keyname) \
-    text_buffer_t tex = text_buffer_create(-1); \
-    text_buffer_append_string0(&tex, tag_->value); \
-    text_buffer_terminate_string(&tex); \
-    meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
-    meta_tag->key = keyname; \
-    strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
-    APPEND_META(doc, meta_tag) \
-    text_buffer_destroy(&tex);
+#define APPEND_TAG_META(keyname) /* uses `doc` and `tag` from the enclosing scope */ \
+    APPEND_UTF8_META(doc, keyname, tag->value)
 
 #define STRCPY_TOLOWER(dst, str) \
     strncpy(dst, str, sizeof(dst)); \
@@ -190,17 +183,17 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
         STRCPY_TOLOWER(key, tag->key)
 
         if (strcmp(key, "artist") == 0) {
-            APPEND_TAG_META(doc, tag, MetaArtist)
+            APPEND_TAG_META(MetaArtist)
         } else if (strcmp(key, "genre") == 0) {
-            APPEND_TAG_META(doc, tag, MetaGenre)
+            APPEND_TAG_META(MetaGenre)
         } else if (strcmp(key, "title") == 0) {
-            APPEND_TAG_META(doc, tag, MetaTitle)
+            APPEND_TAG_META(MetaTitle)
         } else if (strcmp(key, "album_artist") == 0) {
-            APPEND_TAG_META(doc, tag, MetaAlbumArtist)
+            APPEND_TAG_META(MetaAlbumArtist)
         } else if (strcmp(key, "album") == 0) {
-            APPEND_TAG_META(doc, tag, MetaAlbum)
+            APPEND_TAG_META(MetaAlbum)
         } else if (strcmp(key, "comment") == 0) {
-            APPEND_TAG_META(doc, tag, MetaContent)
+            APPEND_TAG_META(MetaContent)
         }
     }
 }
@@ -244,25 +237,25 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
             if (strcmp(key, "artist") == 0) {
                 append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
             } else if (strcmp(tag->key, "ImageDescription") == 0) {
-                APPEND_TAG_META(doc, tag, MetaContent)
+                APPEND_TAG_META(MetaContent)
             } else if (strcmp(tag->key, "Make") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifMake)
+                APPEND_TAG_META(MetaExifMake)
             } else if (strcmp(tag->key, "Model") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifModel)
+                APPEND_TAG_META(MetaExifModel)
             } else if (strcmp(tag->key, "Software") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifSoftware)
+                APPEND_TAG_META(MetaExifSoftware)
             } else if (strcmp(tag->key, "FNumber") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifFNumber)
+                APPEND_TAG_META(MetaExifFNumber)
             } else if (strcmp(tag->key, "FocalLength") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifFocalLength)
+                APPEND_TAG_META(MetaExifFocalLength)
             } else if (strcmp(tag->key, "UserComment") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifUserComment)
+                APPEND_TAG_META(MetaExifUserComment)
             } else if (strcmp(tag->key, "ISOSpeedRatings") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings)
+                APPEND_TAG_META(MetaExifIsoSpeedRatings)
             } else if (strcmp(tag->key, "ExposureTime") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifExposureTime)
+                APPEND_TAG_META(MetaExifExposureTime)
             } else if (strcmp(tag->key, "DateTime") == 0) {
-                APPEND_TAG_META(doc, tag, MetaExifDateTime)
+                APPEND_TAG_META(MetaExifDateTime)
             }
         }
     }
diff --git a/libscan/msdoc/msdoc.c b/libscan/msdoc/msdoc.c
new file mode 100644
index 0000000..21775e6
--- /dev/null
+++ b/libscan/msdoc/msdoc.c
@@ -0,0 +1,165 @@
+#include "msdoc.h"
+#include <errno.h>
+
+#include <sys/mman.h>
+#include "../../third-party/antiword/src/antiword.h"
+
+#include "../ebook/ebook.h"
+
+/**
+ * Extracts author, title and UTF-8 text content from a .doc file using antiword.
+ * Returns without appending anything when the input cannot be opened or its guessed Word version is < 0 or 3.
+ */
+void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
+    size_t buf_len;
+    char *buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        CTX_LOG_ERROR(f->filepath, "read_all() failed")
+        return;
+    }
+    FILE *file_in = fmemopen(buf, buf_len, "rb");
+    if (file_in == NULL) {
+        free(buf);
+        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
+        return;
+    }
+
+    // Configure antiword for plain-text, UTF-8 output
+    options_type *opts = direct_vGetOptions();
+    opts->iParagraphBreak = 74;
+    opts->eConversionType = conversion_text;
+    opts->bHideHiddenText = 1;
+    opts->bRemoveRemovedText = 1;
+    opts->bUseLandscape = 0;
+    opts->eEncoding = encoding_utf_8;
+    opts->iPageHeight = 842; // A4
+    opts->iPageWidth = 595;
+    opts->eImageLevel = level_ps_3;
+
+    int doc_word_version = iGuessVersionNumber(file_in, buf_len);
+    if (doc_word_version < 0 || doc_word_version == 3) {
+        fclose(file_in);
+        free(buf);
+        return;
+    }
+    rewind(file_in);
+
+    size_t out_len = 0;
+    char *out_buf = NULL;
+    FILE *file_out = open_memstream(&out_buf, &out_len);
+    diagram_type *diag = file_out == NULL ? NULL : pCreateDiagram("antiword", NULL, file_out);
+    if (diag == NULL) { // release everything acquired so far, not only file_in
+        if (file_out != NULL) {
+            fclose(file_out);
+            free(out_buf);
+        }
+        fclose(file_in);
+        free(buf);
+        return;
+    }
+
+    iInitDocument(file_in, buf_len);
+    const char *author = szGetAuthor();
+    if (author != NULL) {
+        APPEND_UTF8_META(doc, MetaAuthor, author)
+    }
+    const char *title = szGetTitle();
+    if (title != NULL) {
+        APPEND_UTF8_META(doc, MetaTitle, title)
+    }
+    vFreeDocument();
+
+    bWordDecryptor(file_in, buf_len, diag);
+    vDestroyDiagram(diag);
+    fclose(file_out); // closing the memstream finalizes out_buf/out_len
+
+    if (buf_len > 0) {
+        text_buffer_t tex = text_buffer_create(ctx->content_size);
+        text_buffer_append_string(&tex, out_buf, out_len);
+        text_buffer_terminate_string(&tex);
+        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
+        meta_content->key = MetaContent;
+        memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
+        APPEND_META(doc, meta_content)
+        text_buffer_destroy(&tex);
+    }
+    fclose(file_in);
+    free(buf);
+    free(out_buf);
+}
+
+/**
+ * Converts a .doc file to an in-memory PDF with antiword and passes it to parse_ebook_mem().
+ * Returns without parsing anything when the input cannot be opened or its guessed Word version is < 0 or 3.
+ */
+void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
+    scan_ebook_ctx_t ebook_ctx = {
+            .content_size = ctx->content_size,
+            .tn_size = ctx->tn_size,
+            .log = ctx->log,
+            .logf = ctx->logf,
+            .store = ctx->store,
+    };
+    size_t buf_len;
+    char *buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        CTX_LOG_ERROR(f->filepath, "read_all() failed")
+        return;
+    }
+    FILE *file = fmemopen(buf, buf_len, "rb");
+    if (file == NULL) {
+        free(buf);
+        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
+        return;
+    }
+
+    // Configure antiword for PDF output
+    options_type *opts = direct_vGetOptions();
+    opts->iParagraphBreak = 74;
+    opts->eConversionType = conversion_pdf;
+    opts->bHideHiddenText = 1;
+    opts->bRemoveRemovedText = 1;
+    opts->bUseLandscape = 0;
+    opts->eEncoding = encoding_latin_2;
+    opts->iPageHeight = 842; // A4
+    opts->iPageWidth = 595;
+    opts->eImageLevel = level_ps_3;
+
+    int doc_word_version = iGuessVersionNumber(file, buf_len);
+    if (doc_word_version < 0 || doc_word_version == 3) {
+        fclose(file);
+        free(buf);
+        return;
+    }
+    rewind(file);
+
+    size_t out_len = 0;
+    char *out_buf = NULL;
+    FILE *file_out = open_memstream(&out_buf, &out_len);
+    diagram_type *diag = file_out == NULL ? NULL : pCreateDiagram("antiword", NULL, file_out);
+    if (diag == NULL) { // release everything acquired so far, not only file
+        if (file_out != NULL) {
+            fclose(file_out);
+            free(out_buf);
+        }
+        fclose(file);
+        free(buf);
+        return;
+    }
+
+    bWordDecryptor(file, buf_len, diag);
+    vDestroyDiagram(diag);
+    fclose(file_out); // closing the memstream finalizes out_buf/out_len
+    parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc);
+    fclose(file);
+    free(buf);
+    free(out_buf);
+}
+
+// Runs the PDF conversion path when ctx->tn_size > 0, the text-only path otherwise.
+void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
+    const int wants_pdf = ctx->tn_size > 0;
+    void (*handler)(scan_msdoc_ctx_t *, vfile_t *, document_t *) =
+            wants_pdf ? parse_msdoc_pdf : parse_msdoc_text;
+    handler(ctx, f, doc);
+}
diff --git a/libscan/msdoc/msdoc.h b/libscan/msdoc/msdoc.h
new file mode 100644
index 0000000..21579c6
--- /dev/null
+++ b/libscan/msdoc/msdoc.h
@@ -0,0 +1,22 @@
+#ifndef SCAN_SCAN_MSDOC_H
+#define SCAN_SCAN_MSDOC_H
+
+#include "../scan.h"
+
+typedef struct {
+    long content_size; // passed to text_buffer_create() for the content buffer and to the ebook context
+    int tn_size; // > 0 makes parse_msdoc() take the PDF conversion path
+    log_callback_t log; // forwarded to the ebook parser context
+    logf_callback_t logf; // forwarded to the ebook parser context
+    store_callback_t store; // forwarded to the ebook parser context
+    unsigned int msdoc_mime; // value is_msdoc() compares a mime against
+} scan_msdoc_ctx_t;
+
+// Non-zero when mime equals the value stored in ctx->msdoc_mime.
+__always_inline static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
+    return ctx->msdoc_mime == mime;
+}
+
+void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc); // parses f and appends the extracted metadata to doc
+
+#endif
diff --git a/libscan/scan.h b/libscan/scan.h
index 802ae77..055f595 100644
--- a/libscan/scan.h
+++ b/libscan/scan.h
@@ -1,6 +1,8 @@
 #ifndef SCAN_SCAN_H
 #define SCAN_SCAN_H
 
+#define _GNU_SOURCE
+
 #include <stdio.h>
 #include <sys/stat.h>
 #include <uuid/uuid.h>
@@ -147,16 +149,6 @@ typedef struct parse_job_t {
 } parse_job_t;
 
 
-#define APPEND_META(doc, meta) \
-    meta->next = NULL;\
-    if (doc->meta_head == NULL) {\
-        doc->meta_head = meta;\
-        doc->meta_tail = doc->meta_head;\
-    } else {\
-        doc->meta_tail->next = meta;\
-        doc->meta_tail = meta;\
-    }
-
 #include "util.h"
 
 typedef void (*parse_callback_t)(parse_job_t *job);
diff --git a/libscan/util.h b/libscan/util.h
index 4e3fe28..959fda3 100644
--- a/libscan/util.h
+++ b/libscan/util.h
@@ -273,7 +273,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t
     return 0;
 }
 
-static int text_buffer_append_string0(text_buffer_t *buf, char *str) {
+static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
     return text_buffer_append_string(buf, str, strlen(str));
 }
 
diff --git a/test/main.cpp b/test/main.cpp
index f0182c8..67d186b 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -10,6 +10,7 @@ extern "C" {
 #include "../libscan/ooxml/ooxml.h"
 #include "../libscan/mobi/scan_mobi.h"
 #include "../libscan/raw/raw.h"
+#include "../libscan/msdoc/msdoc.h"
 #include <libavutil/avutil.h>
 }
 
@@ -33,6 +34,10 @@ static scan_mobi_ctx_t mobi_500_ctx;
 
 static scan_raw_ctx_t raw_ctx;
 
+static scan_msdoc_ctx_t msdoc_ctx;
+
+static scan_msdoc_ctx_t msdoc_text_ctx;
+
 
 document_t LastSubDoc;
 
@@ -689,6 +694,98 @@ TEST(RAW, Fuji) {
     cleanup(&doc, &f);
 }
 
+/* msdoc */
+TEST(Msdoc, Test1Pdf) {
+    // msdoc_ctx has tn_size > 0, so parse_msdoc() takes the PDF conversion path
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);
+    const size_t size_before = store_size;
+    parse_msdoc(&msdoc_ctx, &f, &doc);
+    const char *content = get_meta(&doc, MetaContent)->str_val;
+
+    ASSERT_TRUE(strstr(content, "October 2000") != nullptr);
+    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
+    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
+    ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 57);
+    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
+    ASSERT_NE(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Msdoc, Test1Text) {
+    // msdoc_text_ctx has tn_size == 0: text-only path, which must leave store_size unchanged
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);
+    const size_t size_before = store_size;
+    parse_msdoc(&msdoc_text_ctx, &f, &doc);
+    const char *content = get_meta(&doc, MetaContent)->str_val;
+
+    ASSERT_TRUE(strstr(content, "October 2000") != nullptr);
+    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
+    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
+    ASSERT_NEAR(strlen(content), msdoc_text_ctx.content_size, 4);
+    ASSERT_EQ(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Msdoc, Test2Pdf) {
+    // PDF path on test2.doc: checks content, title, author, content length and that store_size changed
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc);
+    const size_t size_before = store_size;
+    parse_msdoc(&msdoc_ctx, &f, &doc);
+    const char *content = get_meta(&doc, MetaContent)->str_val;
+
+    ASSERT_TRUE(strstr(content, "GNU Free Documentation License") != nullptr);
+    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging Information Format");
+    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender");
+    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
+    ASSERT_NE(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Msdoc, Test3Pdf) {
+    // PDF path on test3.doc: checks content, title, author, content length and that store_size changed
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc);
+    const size_t size_before = store_size;
+    parse_msdoc(&msdoc_ctx, &f, &doc);
+    const char *content = get_meta(&doc, MetaContent)->str_val;
+
+    ASSERT_TRUE(strstr(content, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr);
+    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification");
+    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet");
+    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
+    ASSERT_NE(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Msdoc, Test4Pdf) {
+    // PDF path on test4.doc: checks content, title, author, content length and that store_size changed
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc);
+    const size_t size_before = store_size;
+    parse_msdoc(&msdoc_ctx, &f, &doc);
+    const char *content = get_meta(&doc, MetaContent)->str_val;
+
+    ASSERT_TRUE(strstr(content, "SQL Server international data types") != nullptr);
+    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template");
+    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen");
+    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
+    ASSERT_NE(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
 
 int main(int argc, char **argv) {
     setlocale(LC_ALL, "");
@@ -753,6 +850,18 @@ int main(int argc, char **argv) {
     raw_ctx.tn_size = 500;
     raw_ctx.tn_qscale = 5.0;
 
+    msdoc_ctx.log = noop_log;
+    msdoc_ctx.logf = noop_logf;
+    msdoc_ctx.store = counter_store;
+    msdoc_ctx.content_size = 500;
+    msdoc_ctx.tn_size = 500;
+
+    msdoc_text_ctx.log = noop_log;
+    msdoc_text_ctx.logf = noop_logf;
+    msdoc_text_ctx.store = counter_store;
+    msdoc_text_ctx.content_size = 500;
+    msdoc_text_ctx.tn_size = 0;
+
     av_log_set_level(AV_LOG_QUIET);
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
diff --git a/third-party/antiword b/third-party/antiword
new file mode 160000
index 0000000..be5e260
--- /dev/null
+++ b/third-party/antiword
@@ -0,0 +1 @@
+Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030
diff --git a/third-party/utf8.h b/third-party/utf8.h
index fdcacc0..e976254 160000
--- a/third-party/utf8.h
+++ b/third-party/utf8.h
@@ -1 +1 @@
-Subproject commit fdcacc00ff48f7d268108dfb0ec7ebc485f1eb16
+Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50