Better support for .doc files

This commit is contained in:
simon987 2020-12-16 20:04:26 -05:00
parent 11876ffbad
commit 6b47b4dfbb
8 changed files with 69 additions and 36 deletions

View File

@ -1 +1,7 @@
Please use the [sist2](https://github.com/simon987/sist2) issue tracker to report issues.
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter='*Fuzz*' --gtest_repeat=100
```

View File

@ -6,22 +6,7 @@
#include "../ebook/ebook.h"
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
// Open file
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file_in = fmemopen(buf, buf_len, "rb");
if (file_in == NULL) {
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len) {
// Open word doc
options_type *opts = direct_vGetOptions();
@ -88,7 +73,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
free(out_buf);
}
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file, void* buf, size_t buf_len) {
scan_ebook_ctx_t ebook_ctx = {
.content_size = ctx->content_size,
@ -98,20 +83,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
.store = ctx->store,
};
// Open file
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
// Open word doc
options_type *opts = direct_vGetOptions();
@ -157,9 +128,24 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
}
/**
 * Entry point for MS Word (.doc) parsing.
 *
 * Reads the whole virtual file into memory, wraps that memory in a read-only
 * stdio stream, then hands both to either the PDF-based or the text-only
 * parser depending on ctx->tn_size.
 *
 * @param ctx  Scan context; tn_size selects the parsing path.
 * @param f    Virtual file to read; f->filepath is used in error logs.
 * @param doc  Document forwarded unchanged to the selected parser.
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
// Load the full file content; buf_len receives the number of bytes read.
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
// Expose the in-memory copy as a binary, read-only FILE stream.
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
// No stream was created, so the buffer is released here before bailing out.
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
// tn_size > 0 selects the PDF path, otherwise the text-only path.
// NOTE(review): neither `file` nor `buf` is released in this function;
// presumably the callee takes ownership of both -- confirm.
if (ctx->tn_size > 0) {
parse_msdoc_pdf(ctx, f, doc);
parse_msdoc_pdf(ctx, f, doc, file, buf, buf_len);
} else {
parse_msdoc_text(ctx, f, doc);
parse_msdoc_text(ctx, f, doc, file, buf, buf_len);
}
}

View File

@ -19,4 +19,6 @@ static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

View File

@ -786,6 +786,27 @@ TEST(Msdoc, Test4Pdf) {
cleanup(&doc, &f);
}
// Fuzz test: feeds 1000 randomly corrupted/truncated copies of a known .doc
// file to parse_msdoc_text(). The test passes as long as the parser does not
// crash; no output is asserted.
TEST(Msdoc, TestFuzz1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/fuzz_ole.doc", &f, &doc);

    size_t buf_len;
    char *buf = (char *) read_all(&f, &buf_len);
    if (buf == NULL) {
        // Without the pristine input there is nothing to fuzz.
        cleanup(&doc, &f);
        FAIL() << "read_all() failed";
    }

    for (int i = 0; i < 1000; i++) {
        // Each round mutates a fresh copy so corruption does not accumulate.
        size_t buf_len_copy = buf_len;
        char *buf_copy = (char *) malloc(buf_len);
        if (buf_copy == NULL) {
            free(buf);
            cleanup(&doc, &f);
            FAIL() << "malloc() failed";
        }
        memcpy(buf_copy, buf, buf_len);

        fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);

        FILE *file = fmemopen(buf_copy, buf_len_copy, "rb");
        if (file == NULL) {
            // The parser never saw this buffer, so it is still ours to free.
            free(buf_copy);
            continue;
        }

        // NOTE(review): buf_copy and file are not released here; presumably
        // parse_msdoc_text() takes ownership of both -- confirm.
        parse_msdoc_text(&msdoc_text_ctx, &f, &doc, file, buf_copy, buf_len_copy);
    }

    free(buf);
    cleanup(&doc, &f);
}
int main(int argc, char **argv) {
setlocale(LC_ALL, "");
@ -833,7 +854,7 @@ int main(int argc, char **argv) {
media_ctx.store = counter_store;
media_ctx.tn_size = 500;
media_ctx.tn_qscale = 1.0;
media_ctx.max_media_buffer = (long)2000 * (long)1024 * (long)1024;
media_ctx.max_media_buffer = (long) 2000 * (long) 1024 * (long) 1024;
ooxml_500_ctx.content_size = 500;
ooxml_500_ctx.log = noop_log;

View File

@ -94,3 +94,19 @@ void destroy_doc(document_t *doc) {
free(tmp);
}
}
/**
 * Randomly corrupt, and occasionally truncate, a buffer in place (fuzz helper).
 *
 * Runs up to n rounds. Each round draws a random offset with rand(). With a
 * probability of roughly trunc_p percent the round shortens *buf_len to
 * max(offset, 1000) -- but never lengthens it -- and writes nothing.
 * Otherwise it overwrites `width` consecutive bytes starting at the offset
 * with random values.
 *
 * The helper stops early, leaving the buffer as it is, once *buf_len is no
 * larger than width + 1, because no valid offset exists for such a buffer.
 *
 * @param buf      Buffer to mutate; must hold at least *buf_len bytes.
 * @param buf_len  In/out: usable length of buf. May shrink, never grows.
 * @param width    Number of consecutive bytes overwritten per round.
 * @param n        Number of rounds.
 * @param trunc_p  Chance, in percent (0-100), that a round truncates.
 */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
    if (buf == NULL || buf_len == NULL || width < 0) {
        return;
    }

    // A truncation keeps at least this many bytes (when the buffer has them).
    const size_t min_truncated_len = 1000;
    // Bytes reserved after the offset: the written window plus one spare byte.
    const size_t span = (size_t) width + 1;

    for (int i = 0; i < n; i++) {
        // Re-checked every round because a truncation may have shrunk the
        // buffer; guards against modulo-by-zero and unsigned wrap-around.
        if (*buf_len <= span) {
            break;
        }

        size_t offset = (size_t) rand() % (*buf_len - span);

        if (rand() % 100 < trunc_p) {
            size_t new_len = offset > min_truncated_len ? offset : min_truncated_len;
            // Only ever shrink: reporting more bytes than the caller
            // allocated would make consumers read out of bounds.
            if (new_len < *buf_len) {
                *buf_len = new_len;
            }
            continue;
        }

        for (int disp = 0; disp < width; disp++) {
            buf[offset + (size_t) disp] = (char) (int8_t) rand();
        }
    }
}

View File

@ -42,4 +42,6 @@ meta_line_t *get_meta_from(meta_line_t *meta, metakey key);
void destroy_doc(document_t *doc);
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p);
#endif

@ -1 +1 @@
Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030
Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6

2
third-party/utf8.h vendored

@ -1 +1 @@
Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50
Subproject commit a67acc78fd0fc272ad45362b828efdcb24874e64