mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 04:22:58 +00:00
Better support for .doc files
This commit is contained in:
parent
11876ffbad
commit
6b47b4dfbb
@ -1 +1,7 @@
|
||||
Please use the [sist2](https://github.com/simon987/sist2) issue tracker to report issues.
|
||||
|
||||
|
||||
### Run fuzz tests:
|
||||
```bash
|
||||
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
|
||||
```
|
@ -6,22 +6,7 @@
|
||||
|
||||
#include "../ebook/ebook.h"
|
||||
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
// Open file
|
||||
size_t buf_len;
|
||||
char *buf = read_all(f, &buf_len);
|
||||
if (buf == NULL) {
|
||||
CTX_LOG_ERROR(f->filepath, "read_all() failed")
|
||||
return;
|
||||
}
|
||||
|
||||
FILE *file_in = fmemopen(buf, buf_len, "rb");
|
||||
if (file_in == NULL) {
|
||||
free(buf);
|
||||
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
|
||||
return;
|
||||
}
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len) {
|
||||
|
||||
// Open word doc
|
||||
options_type *opts = direct_vGetOptions();
|
||||
@ -88,7 +73,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
free(out_buf);
|
||||
}
|
||||
|
||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file, void* buf, size_t buf_len) {
|
||||
|
||||
scan_ebook_ctx_t ebook_ctx = {
|
||||
.content_size = ctx->content_size,
|
||||
@ -98,20 +83,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
.store = ctx->store,
|
||||
};
|
||||
|
||||
// Open file
|
||||
size_t buf_len;
|
||||
char *buf = read_all(f, &buf_len);
|
||||
if (buf == NULL) {
|
||||
CTX_LOG_ERROR(f->filepath, "read_all() failed")
|
||||
return;
|
||||
}
|
||||
|
||||
FILE *file = fmemopen(buf, buf_len, "rb");
|
||||
if (file == NULL) {
|
||||
free(buf);
|
||||
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
|
||||
return;
|
||||
}
|
||||
// Open word doc
|
||||
|
||||
options_type *opts = direct_vGetOptions();
|
||||
@ -157,9 +128,24 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
}
|
||||
|
||||
/**
 * Entry point for MS Word documents.
 *
 * Reads the whole virtual file into memory with read_all(), wraps that buffer
 * in a read-only memory stream, then dispatches to parse_msdoc_pdf() when
 * ctx->tn_size is greater than zero and to parse_msdoc_text() otherwise.
 *
 * On read_all() or fmemopen() failure an error is logged against f->filepath
 * and the function returns without touching the document.
 *
 * NOTE(review): neither `file` nor `buf` is released here after a successful
 * dispatch; presumably the callee closes the stream and frees the buffer.
 * Confirm against parse_msdoc_text() / parse_msdoc_pdf().
 *
 * @param ctx scan context; tn_size selects which parser is used
 * @param f   virtual file to read
 * @param doc document handed through to the selected parser
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        // Capture errno before free(): free() is allowed to overwrite it,
        // which would make the logged error code meaningless.
        int fmemopen_errno = errno;
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", fmemopen_errno)
        return;
    }

    if (ctx->tn_size > 0) {
        parse_msdoc_pdf(ctx, f, doc, file, buf, buf_len);
    } else {
        parse_msdoc_text(ctx, f, doc, file, buf, buf_len);
    }
}
|
||||
|
@ -19,4 +19,6 @@ static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
|
||||
|
||||
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
|
||||
|
||||
#endif
|
||||
|
@ -786,6 +786,27 @@ TEST(Msdoc, Test4Pdf) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
// Fuzz test: loads a sample .doc file once, then runs the text parser on
// 1000 randomly corrupted copies of it. The test has no assertions on parser
// output; it is expected simply not to crash.
TEST(Msdoc, TestFuzz1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/fuzz_ole.doc", &f, &doc);

    size_t buf_len;
    char *buf = (char *) read_all(&f, &buf_len);
    // Without the pristine buffer there is nothing to fuzz
    ASSERT_TRUE(buf != NULL) << "read_all() failed";

    for (int i = 0; i < 1000; i++) {
        // Each iteration works on a fresh copy so corruption does not accumulate
        size_t buf_len_copy = buf_len;
        char *buf_copy = (char *) malloc(buf_len);
        ASSERT_TRUE(buf_copy != NULL) << "malloc() failed";
        memcpy(buf_copy, buf, buf_len);

        fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);

        FILE *file = fmemopen(buf_copy, buf_len_copy, "rb");
        if (file == NULL) {
            free(buf_copy);
        }
        ASSERT_TRUE(file != NULL) << "fmemopen() failed";

        // NOTE(review): buf_copy and file are not released in this loop;
        // presumably parse_msdoc_text() takes ownership of both. Confirm.
        parse_msdoc_text(&msdoc_text_ctx, &f, &doc, file, buf_copy, buf_len_copy);
    }

    free(buf);
    cleanup(&doc, &f);
}
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
setlocale(LC_ALL, "");
|
||||
|
@ -94,3 +94,19 @@ void destroy_doc(document_t *doc) {
|
||||
free(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Randomly corrupt a buffer in place, for fuzz testing.
 *
 * Performs up to `n` rounds. Each round picks a random offset and then either
 *  - with probability `trunc_p` percent, shrinks the logical length to that
 *    offset (but never to fewer than 1000 bytes, and never to more than the
 *    current length), or
 *  - overwrites `width` consecutive bytes starting at the offset with random
 *    values.
 *
 * Uses rand(); seed with srand() for reproducible runs.
 *
 * @param buf     buffer to corrupt
 * @param buf_len in/out: logical length of `buf`; only ever decreased
 * @param width   number of consecutive bytes overwritten per mutation
 * @param n       number of rounds
 * @param trunc_p chance (0-100) that a round truncates instead of mutating
 */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
    if (buf == NULL || buf_len == NULL || width < 0) {
        return;
    }

    const size_t span = (size_t) width;
    const size_t min_trunc_len = 1000;

    for (int i = 0; i < n; i++) {

        // The offset range is (*buf_len - width - 1); with unsigned arithmetic
        // a buffer that is too small would wrap around or yield modulo by zero.
        if (*buf_len <= span + 1) {
            return;
        }

        size_t offset = (size_t) rand() % (*buf_len - span - 1);

        if (rand() % 100 < trunc_p) {
            size_t new_len = offset > min_trunc_len ? offset : min_trunc_len;
            // Truncation must never grow the length past the real buffer
            if (new_len < *buf_len) {
                *buf_len = new_len;
            }
            continue;
        }

        for (size_t disp = 0; disp < span; disp++) {
            buf[offset + disp] = (int8_t) rand();
        }
    }
}
|
||||
|
@ -42,4 +42,6 @@ meta_line_t *get_meta_from(meta_line_t *meta, metakey key);
|
||||
|
||||
void destroy_doc(document_t *doc);
|
||||
|
||||
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p);
|
||||
|
||||
#endif
|
||||
|
2
third-party/antiword
vendored
2
third-party/antiword
vendored
@ -1 +1 @@
|
||||
Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030
|
||||
Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6
|
2
third-party/utf8.h
vendored
2
third-party/utf8.h
vendored
@ -1 +1 @@
|
||||
Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50
|
||||
Subproject commit a67acc78fd0fc272ad45362b828efdcb24874e64
|
Loading…
x
Reference in New Issue
Block a user