From 6b47b4dfbb28490f0bb2d30c0ca75ac945db2160 Mon Sep 17 00:00:00 2001 From: simon987 Date: Wed, 16 Dec 2020 20:04:26 -0500 Subject: [PATCH] Better support for .doc files --- README.md | 6 +++++ libscan/msdoc/msdoc.c | 52 ++++++++++++++++--------------------------- libscan/msdoc/msdoc.h | 2 ++ test/main.cpp | 23 ++++++++++++++++++- test/test_util.cpp | 16 +++++++++++++ test/test_util.h | 2 ++ third-party/antiword | 2 +- third-party/utf8.h | 2 +- 8 files changed, 69 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index ed5aec4..edaf389 100644 --- a/README.md +++ b/README.md @@ -1 +1,7 @@ Please use [sist2](https://github.com/simon987/sist2) tracker for issues + + +### Run fuzz tests: +```bash +./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100 +``` \ No newline at end of file diff --git a/libscan/msdoc/msdoc.c b/libscan/msdoc/msdoc.c index 21775e6..033a63a 100644 --- a/libscan/msdoc/msdoc.c +++ b/libscan/msdoc/msdoc.c @@ -6,22 +6,7 @@ #include "../ebook/ebook.h" -void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { - - // Open file - size_t buf_len; - char *buf = read_all(f, &buf_len); - if (buf == NULL) { - CTX_LOG_ERROR(f->filepath, "read_all() failed") - return; - } - - FILE *file_in = fmemopen(buf, buf_len, "rb"); - if (file_in == NULL) { - free(buf); - CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) - return; - } +void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len) { // Open word doc options_type *opts = direct_vGetOptions(); @@ -88,7 +73,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { free(out_buf); } -void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { +void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file, void* buf, size_t buf_len) { scan_ebook_ctx_t ebook_ctx = { .content_size = ctx->content_size, @@ -98,20 +83,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { .store = ctx->store, }; - // Open file - size_t buf_len; - char *buf = read_all(f, &buf_len); - if (buf == NULL) { - CTX_LOG_ERROR(f->filepath, "read_all() failed") - return; - } - - FILE *file = fmemopen(buf, buf_len, "rb"); - if (file == NULL) { - free(buf); - CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) - return; - } // Open word doc options_type *opts = direct_vGetOptions(); @@ -157,9 +128,24 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { } void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { + + size_t buf_len; + char *buf = read_all(f, &buf_len); + if (buf == NULL) { + CTX_LOG_ERROR(f->filepath, "read_all() failed") + return; + } + + FILE *file = fmemopen(buf, buf_len, "rb"); + if (file == NULL) { + free(buf); + CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) + return; + } + if (ctx->tn_size > 0) { - parse_msdoc_pdf(ctx, f, doc); + parse_msdoc_pdf(ctx, f, doc, file, buf, buf_len); } else { - parse_msdoc_text(ctx, f, doc); + parse_msdoc_text(ctx, f, doc, file, buf, buf_len); } } diff --git a/libscan/msdoc/msdoc.h b/libscan/msdoc/msdoc.h index 21579c6..aac8f6d 100644 --- a/libscan/msdoc/msdoc.h +++ b/libscan/msdoc/msdoc.h @@ -19,4 +19,6 @@ static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) { void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc); +void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len); + #endif diff --git a/test/main.cpp b/test/main.cpp index 67d186b..65d925b 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -786,6 +786,27 @@ TEST(Msdoc, Test4Pdf) { cleanup(&doc, &f); } +TEST(Msdoc, TestFuzz1) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/msdoc/fuzz_ole.doc", &f, &doc); + + size_t buf_len; + char *buf = (char *) read_all(&f, &buf_len); + + for (int i = 0; i < 1000; i++) { + size_t buf_len_copy = buf_len; + char *buf_copy = (char*)malloc(buf_len); + memcpy(buf_copy, buf, buf_len); + + fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5); + FILE *file = fmemopen(buf_copy, buf_len_copy, "rb"); + parse_msdoc_text(&msdoc_text_ctx, &f, &doc, file, buf_copy, buf_len_copy); + } + free(buf); + cleanup(&doc, &f); +} + int main(int argc, char **argv) { setlocale(LC_ALL, ""); @@ -833,7 +854,7 @@ int main(int argc, char **argv) { media_ctx.store = counter_store; media_ctx.tn_size = 500; media_ctx.tn_qscale = 1.0; - media_ctx.max_media_buffer = (long)2000 * (long)1024 * (long)1024; + media_ctx.max_media_buffer = (long) 2000 * (long) 1024 * (long) 1024; ooxml_500_ctx.content_size = 500; ooxml_500_ctx.log = noop_log; diff --git a/test/test_util.cpp b/test/test_util.cpp index efbc45a..9e134b0 100644 --- a/test/test_util.cpp +++ b/test/test_util.cpp @@ -94,3 +94,19 @@ void destroy_doc(document_t *doc) { free(tmp); } } + +void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) { + for (int i = 0; i < n; i++) { + + size_t offset = rand() % (*buf_len - width - 1); + + if (rand() % 100 < trunc_p) { + *buf_len = MAX(offset, 1000); + continue; + } + + for (int disp = 0; disp < width; disp++) { + buf[offset + disp] = (int8_t)rand(); + } + } +} diff --git a/test/test_util.h b/test/test_util.h index 32bde2a..8b33009 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -42,4 +42,6 @@ meta_line_t *get_meta_from(meta_line_t *meta, metakey key); void destroy_doc(document_t *doc); +void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p); + #endif diff --git a/third-party/antiword b/third-party/antiword index be5e260..eb8d737 160000 --- a/third-party/antiword +++ b/third-party/antiword @@ -1 +1 @@ -Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030 +Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6 diff --git a/third-party/utf8.h b/third-party/utf8.h index e976254..a67acc7 160000 --- a/third-party/utf8.h +++ b/third-party/utf8.h @@ -1 +1 @@ -Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50 +Subproject commit a67acc78fd0fc272ad45362b828efdcb24874e64