diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05890f8..8d983f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,6 @@ find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5
 target_compile_options(
         scan
         PRIVATE
-        -Werror
         -g
 )
 
diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c
index 573c2ad..b9a9d17 100644
--- a/libscan/ooxml/ooxml.c
+++ b/libscan/ooxml/ooxml.c
@@ -145,6 +145,47 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_
     return 0;
 }
 
+/* Upper bound on an embedded thumbnail's size (15 MiB); larger entries are skipped. */
+#define MAX_TN_SIZE (1024 * 1024 * 15)
+
+/**
+ * Store the thumbnail embedded in an OOXML archive under the document's uuid.
+ *
+ * Must be called while `a` is positioned on `entry`. Entries whose reported
+ * size is not positive or exceeds MAX_TN_SIZE are skipped. Nothing is stored
+ * (and no thumbnail metadata is appended) unless the whole entry was read.
+ */
+void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
+    // archive_entry_size() returns a signed 64-bit value; keep it signed so
+    // the lower-bound check below is meaningful.
+    int64_t entry_size = archive_entry_size(entry);
+
+    if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
+        return;
+    }
+
+    char *buf = malloc((size_t) entry_size);
+    if (buf == NULL) {
+        return;
+    }
+
+    // archive_read_data() may return fewer bytes than requested: keep
+    // reading until the entry is complete, ends early (0) or fails (< 0).
+    size_t total = 0;
+    while (total < (size_t) entry_size) {
+        int64_t n = archive_read_data(a, buf + total, (size_t) entry_size - total);
+        if (n <= 0) {
+            free(buf);
+            return;
+        }
+        total += (size_t) n;
+    }
+
+    APPEND_TN_META(doc, 1, 1) // Size unknown
+    ctx->store((char *) doc->uuid, sizeof(doc->uuid), buf, total);
+    free(buf);
+}
+
 void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
 
     size_t buf_len;
@@ -180,6 +221,8 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
                 if (read_doc_props(ctx, a, &tex, doc) != 0) {
                     break;
                 }
+            } else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
+                read_thumbnail(ctx, doc, a, entry);
             }
         }
     }
diff --git a/libscan/ooxml/ooxml.h b/libscan/ooxml/ooxml.h
index 19c31ae..5b2513c 100644
--- a/libscan/ooxml/ooxml.h
+++ b/libscan/ooxml/ooxml.h
@@ -8,6 +8,7 @@ typedef struct {
     long content_size;
     log_callback_t log;
     logf_callback_t logf;
+    store_callback_t store;
 } scan_ooxml_ctx_t;
 
 void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
diff --git a/test/main.cpp b/test/main.cpp
index 9c8e546..95ca84d 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -388,6 +388,21 @@ TEST(Ooxml, Docx1) {
     cleanup(&doc, &f);
 }
 
+TEST(Ooxml, Docx2Thumbnail) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/ooxml/embed_tn.docx", &f, &doc);
+
+    size_t size_before = store_size;
+
+    parse_ooxml(&ooxml_500_ctx, &f, &doc);
+
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
+    ASSERT_NE(size_before, store_size);
+
+    cleanup(&doc, &f);
+}
+
 TEST(Ooxml, Xlsx1) {
     vfile_t f;
     document_t doc;
@@ -550,13 +565,13 @@ int main(int argc, char **argv) {
 
     arc_recurse_media_ctx.log = noop_log;
     arc_recurse_media_ctx.logf = noop_logf;
-    arc_recurse_media_ctx.store = noop_store;
+    arc_recurse_media_ctx.store = counter_store;
     arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
     arc_recurse_media_ctx.parse = _parse_media;
 
     arc_list_ctx.log = noop_log;
     arc_list_ctx.logf = noop_logf;
-    arc_list_ctx.store = noop_store;
+    arc_list_ctx.store = counter_store;
     arc_list_ctx.mode = ARC_MODE_LIST;
 
     text_500_ctx.content_size = 500;
@@ -564,7 +579,7 @@ int main(int argc, char **argv) {
     text_500_ctx.logf = noop_logf;
 
     ebook_ctx.content_size = 999999999999;
-    ebook_ctx.store = noop_store;
+    ebook_ctx.store = counter_store;
     ebook_ctx.tesseract_lang = "eng";
     ebook_ctx.tesseract_path = "./tessdata";
     ebook_ctx.tn_size = 500;
@@ -576,7 +591,7 @@ int main(int argc, char **argv) {
 
     media_ctx.log = noop_log;
     media_ctx.logf = noop_logf;
-    media_ctx.store = noop_store;
+    media_ctx.store = counter_store;
     media_ctx.tn_size = 500;
     media_ctx.tn_qscale = 1.0;
     media_ctx.max_media_buffer = (long)2000 * 1024 * 1024;
@@ -584,6 +599,7 @@ int main(int argc, char **argv) {
     ooxml_500_ctx.content_size = 500;
     ooxml_500_ctx.log = noop_log;
     ooxml_500_ctx.logf = noop_logf;
+    ooxml_500_ctx.store = counter_store;
 
     mobi_500_ctx.content_size = 500;
     mobi_500_ctx.log = noop_log;
@@ -591,7 +607,7 @@ int main(int argc, char **argv) {
 
     raw_ctx.log = noop_log;
     raw_ctx.logf = noop_logf;
-    raw_ctx.store = noop_store;
+    raw_ctx.store = counter_store;
     raw_ctx.tn_size = 500;
     raw_ctx.tn_qscale = 5.0;
 
diff --git a/test/test_util.h b/test/test_util.h
index 97ba2b8..26022be 100644
--- a/test/test_util.h
+++ b/test/test_util.h
@@ -17,8 +17,10 @@ static void noop_log(const char *filepath, int level, char *str) {
     // noop
 }
 
-static void noop_store(char* key, size_t key_len, char *value, size_t value_len) {
-    // noop
+static size_t store_size = 0;
+
+static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
+    store_size += value_len;
 }
 
 meta_line_t *get_meta(document_t *doc, metakey key);