mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 12:23:00 +00:00
Better support for .doc files
This commit is contained in:
parent
11876ffbad
commit
6b47b4dfbb
@ -1 +1,7 @@
|
|||||||
Please use [sist2](https://github.com/simon987/sist2) tracker for issues
|
Please use [sist2](https://github.com/simon987/sist2) tracker for issues
|
||||||
|
|
||||||
|
|
||||||
|
### Run fuzz tests:
|
||||||
|
```bash
|
||||||
|
./scan_a_test --gtest_filter='*Fuzz*' --gtest_repeat=100
|
||||||
|
```
|
@ -6,22 +6,7 @@
|
|||||||
|
|
||||||
#include "../ebook/ebook.h"
|
#include "../ebook/ebook.h"
|
||||||
|
|
||||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len) {
|
||||||
|
|
||||||
// Open file
|
|
||||||
size_t buf_len;
|
|
||||||
char *buf = read_all(f, &buf_len);
|
|
||||||
if (buf == NULL) {
|
|
||||||
CTX_LOG_ERROR(f->filepath, "read_all() failed")
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
FILE *file_in = fmemopen(buf, buf_len, "rb");
|
|
||||||
if (file_in == NULL) {
|
|
||||||
free(buf);
|
|
||||||
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Open word doc
|
// Open word doc
|
||||||
options_type *opts = direct_vGetOptions();
|
options_type *opts = direct_vGetOptions();
|
||||||
@ -88,7 +73,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
free(out_buf);
|
free(out_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file, void* buf, size_t buf_len) {
|
||||||
|
|
||||||
scan_ebook_ctx_t ebook_ctx = {
|
scan_ebook_ctx_t ebook_ctx = {
|
||||||
.content_size = ctx->content_size,
|
.content_size = ctx->content_size,
|
||||||
@ -98,20 +83,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
.store = ctx->store,
|
.store = ctx->store,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Open file
|
|
||||||
size_t buf_len;
|
|
||||||
char *buf = read_all(f, &buf_len);
|
|
||||||
if (buf == NULL) {
|
|
||||||
CTX_LOG_ERROR(f->filepath, "read_all() failed")
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
FILE *file = fmemopen(buf, buf_len, "rb");
|
|
||||||
if (file == NULL) {
|
|
||||||
free(buf);
|
|
||||||
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Open word doc
|
// Open word doc
|
||||||
|
|
||||||
options_type *opts = direct_vGetOptions();
|
options_type *opts = direct_vGetOptions();
|
||||||
@ -157,9 +128,24 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||||
|
|
||||||
|
size_t buf_len;
|
||||||
|
char *buf = read_all(f, &buf_len);
|
||||||
|
if (buf == NULL) {
|
||||||
|
CTX_LOG_ERROR(f->filepath, "read_all() failed")
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
FILE *file = fmemopen(buf, buf_len, "rb");
|
||||||
|
if (file == NULL) {
|
||||||
|
free(buf);
|
||||||
|
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (ctx->tn_size > 0) {
|
if (ctx->tn_size > 0) {
|
||||||
parse_msdoc_pdf(ctx, f, doc);
|
parse_msdoc_pdf(ctx, f, doc, file, buf, buf_len);
|
||||||
} else {
|
} else {
|
||||||
parse_msdoc_text(ctx, f, doc);
|
parse_msdoc_text(ctx, f, doc, file, buf, buf_len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,4 +19,6 @@ static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
|
|||||||
|
|
||||||
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||||
|
|
||||||
|
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -786,6 +786,27 @@ TEST(Msdoc, Test4Pdf) {
|
|||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Fuzz test for the .doc text parser: loads a reference document once, then
 * runs 1000 iterations, each one corrupting a private copy of the file with
 * fuzz_buffer() and feeding it to parse_msdoc_text() through an in-memory
 * stream.
 *
 * NOTE(review): buf_copy and file are handed to parse_msdoc_text() and are not
 * released here, exactly as before; presumably the parser releases them.
 * Confirm against parse_msdoc_text().
 */
TEST(Msdoc, TestFuzz1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/fuzz_ole.doc", &f, &doc);

    size_t buf_len;
    char *buf = (char *) read_all(&f, &buf_len);
    // Without the reference bytes there is nothing to fuzz.
    ASSERT_TRUE(buf != NULL);

    for (int i = 0; i < 1000; i++) {
        size_t buf_len_copy = buf_len;
        char *buf_copy = (char *) malloc(buf_len);
        if (buf_copy == NULL) {
            free(buf);
            FAIL() << "malloc() failed";
        }
        memcpy(buf_copy, buf, buf_len);

        // Corrupt 8 random 3-byte windows; each pass has a 5% chance of
        // truncating the buffer instead.
        fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);

        FILE *file = fmemopen(buf_copy, buf_len_copy, "rb");
        if (file == NULL) {
            // The parser never saw this copy, so it is still ours to free.
            free(buf_copy);
            free(buf);
            FAIL() << "fmemopen() failed";
        }

        parse_msdoc_text(&msdoc_text_ctx, &f, &doc, file, buf_copy, buf_len_copy);
    }

    free(buf);
    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
@ -833,7 +854,7 @@ int main(int argc, char **argv) {
|
|||||||
media_ctx.store = counter_store;
|
media_ctx.store = counter_store;
|
||||||
media_ctx.tn_size = 500;
|
media_ctx.tn_size = 500;
|
||||||
media_ctx.tn_qscale = 1.0;
|
media_ctx.tn_qscale = 1.0;
|
||||||
media_ctx.max_media_buffer = (long)2000 * (long)1024 * (long)1024;
|
media_ctx.max_media_buffer = (long) 2000 * (long) 1024 * (long) 1024;
|
||||||
|
|
||||||
ooxml_500_ctx.content_size = 500;
|
ooxml_500_ctx.content_size = 500;
|
||||||
ooxml_500_ctx.log = noop_log;
|
ooxml_500_ctx.log = noop_log;
|
||||||
|
@ -94,3 +94,19 @@ void destroy_doc(document_t *doc) {
|
|||||||
free(tmp);
|
free(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Randomly corrupt a buffer in place, for fuzz testing.
 *
 * Runs up to n passes. Each pass picks a random offset, then either:
 *   - with probability trunc_p percent, shortens *buf_len to
 *     max(offset, 1000) (never lengthening it), or
 *   - overwrites width consecutive bytes starting at that offset with
 *     random values.
 *
 * Uses rand(), so results are reproducible for a given srand() seed.
 *
 * @param buf      Buffer to corrupt; must hold at least *buf_len bytes.
 * @param buf_len  In/out: current logical length of buf. May only shrink.
 * @param width    Number of consecutive bytes overwritten per pass.
 * @param n        Number of passes.
 * @param trunc_p  Percent chance (0-100) that a pass truncates instead of
 *                 overwriting.
 */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
    if (buf == NULL || buf_len == NULL || width < 0) {
        return;
    }

    const size_t span = (size_t) width;

    for (int i = 0; i < n; i++) {

        // The offset range is [0, *buf_len - width - 1). It must be non-empty,
        // otherwise the subtraction wraps around or the modulo divides by zero.
        if (*buf_len < span + 2) {
            return;
        }

        size_t offset = (size_t) rand() % (*buf_len - span - 1);

        if (rand() % 100 < trunc_p) {
            size_t new_len = offset > 1000 ? offset : 1000;
            // Only ever shrink: a buffer shorter than 1000 bytes must not be
            // reported as longer than its allocation.
            if (new_len < *buf_len) {
                *buf_len = new_len;
            }
            continue;
        }

        for (size_t disp = 0; disp < span; disp++) {
            buf[offset + disp] = (int8_t) rand();
        }
    }
}
|
||||||
|
@ -42,4 +42,6 @@ meta_line_t *get_meta_from(meta_line_t *meta, metakey key);
|
|||||||
|
|
||||||
void destroy_doc(document_t *doc);
|
void destroy_doc(document_t *doc);
|
||||||
|
|
||||||
|
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
2
third-party/antiword
vendored
2
third-party/antiword
vendored
@ -1 +1 @@
|
|||||||
Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030
|
Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6
|
2
third-party/utf8.h
vendored
2
third-party/utf8.h
vendored
@ -1 +1 @@
|
|||||||
Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50
|
Subproject commit a67acc78fd0fc272ad45362b828efdcb24874e64
|
Loading…
x
Reference in New Issue
Block a user