From 1dad8fae20c85de130f61f27060de692b6f89956 Mon Sep 17 00:00:00 2001
From: simon987
Date: Mon, 6 Sep 2021 09:47:57 -0400
Subject: [PATCH] Parse page numbers from .docx files

---
 libscan/ooxml/ooxml.c | 48 +++++++++++++++++++++++++++++++++++++++----
 test/main.cpp         |  2 ++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c
index 2fd4909..d4fd7b3 100644
--- a/libscan/ooxml/ooxml.c
+++ b/libscan/ooxml/ooxml.c
@@ -70,7 +70,7 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
 
 int xml_io_read(void *context, char *buffer, int len) {
     struct archive *a = context;
-    return archive_read_data(a, buffer, len);
+    return (int) archive_read_data(a, buffer, len);
 }
 
 int xml_io_close(UNUSED(void *context)) {
@@ -78,7 +78,7 @@ int xml_io_close(UNUSED(void *context)) {
     return 0;
 }
 
-#define READ_PART_ERR -2
+#define READ_PART_ERR (-2)
 
 __always_inline
 static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
@@ -104,6 +104,42 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
     return ret;
 }
 
+__always_inline
+static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
+    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
+                            XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
+
+    if (xml == NULL) {
+        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
+        return -1;
+    }
+
+    xmlNode *root = xmlDocGetRootElement(xml);
+    if (root == NULL) {
+        CTX_LOG_ERROR(doc->filepath, "Empty document")
+        xmlFreeDoc(xml);
+        return -1;
+    }
+
+    if (xmlStrEqual(root->name, _X("Properties"))) {
+        for (xmlNode *child = root->children; child; child = child->next) {
+            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
+            if (text == NULL) {
+                continue;
+            }
+
+            if (xmlStrEqual(child->name, _X("Pages"))) {
+                APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
+            }
+
+            xmlFree(text);
+        }
+    }
+    xmlFreeDoc(xml);
+
+    return 0;
+}
+
 __always_inline
 static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
     xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
@@ -144,7 +180,7 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *
     return 0;
 }
 
-#define MAX_TN_SIZE 1024 * 1024 * 15
+#define MAX_TN_SIZE (1024 * 1024 * 15)
 
 void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
     size_t entry_size = archive_entry_size(entry);
@@ -153,7 +189,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
         return;
     }
 
-    char* buf = malloc(entry_size);
+    char *buf = malloc(entry_size);
     archive_read_data(a, buf, entry_size);
 
     APPEND_TN_META(doc, 1, 1) // Size unknown
@@ -196,6 +232,10 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
                 } else if (ret == TEXT_BUF_FULL) {
                     buffer_full = TRUE;
                 }
+            } else if (strcmp(path, "docProps/app.xml") == 0) {
+                if (read_doc_props_app(ctx, a, doc) != 0) {
+                    break;
+                }
             } else if (strcmp(path, "docProps/core.xml") == 0) {
                 if (read_doc_props(ctx, a, doc) != 0) {
                     break;
diff --git a/test/main.cpp b/test/main.cpp
index af1e170..8e3b49d 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -587,6 +587,7 @@ TEST(Ooxml, Docx1) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
     ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
+    ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
 
     cleanup(&doc, &f);
@@ -602,6 +603,7 @@ TEST(Ooxml, Docx2Thumbnail) {
     parse_ooxml(&ooxml_500_ctx, &f, &doc);
 
     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
+    ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
     ASSERT_NE(size_before, store_size);
 
     cleanup(&doc, &f);