Read embedded thumbnail simon987/sist2#74

This commit is contained in:
simon987 2020-07-15 20:56:25 -04:00
parent 00d5680217
commit 9a240f039b
5 changed files with 45 additions and 8 deletions

View File

@ -53,7 +53,6 @@ find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5
target_compile_options(
scan
PRIVATE
-Werror
-g
)

View File

@ -145,6 +145,23 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_
return 0;
}
// Largest embedded thumbnail, in bytes, that will be buffered in memory (15 MiB).
// Parenthesised so the expansion stays a single operand wherever it is used.
#define MAX_TN_SIZE (1024 * 1024 * 15)

/**
 * Read the data of the given archive entry and hand it to ctx->store,
 * keyed by the document's uuid, then record thumbnail metadata on the document.
 *
 * Nothing is stored and no metadata is appended when:
 *  - the entry size is not positive or exceeds MAX_TN_SIZE,
 *  - the temporary buffer cannot be allocated,
 *  - the entry cannot be read in full.
 *
 * @param ctx   scan context; its store callback receives the thumbnail bytes
 * @param doc   document being parsed; supplies the store key and receives the thumbnail meta
 * @param a     archive handle the entry's data is read from
 * @param entry header of the entry holding the thumbnail
 */
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
    // archive_entry_size() yields a signed 64-bit value; keep it signed so the
    // non-positive check is meaningful instead of wrapping to a huge size_t.
    long long entry_size = archive_entry_size(entry);
    if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
        return;
    }

    size_t tn_len = (size_t) entry_size;
    char *buf = malloc(tn_len);
    if (buf == NULL) {
        return;
    }

    // A negative result is a read error and a short count means the entry ended
    // early; in both cases the buffer does not hold the complete image.
    long long bytes_read = archive_read_data(a, buf, tn_len);
    if (bytes_read != entry_size) {
        free(buf);
        return;
    }

    APPEND_TN_META(doc, 1, 1) // Size unknown
    ctx->store((char *) doc->uuid, sizeof(doc->uuid), buf, tn_len);
    free(buf);
}
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
@ -180,6 +197,8 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (read_doc_props(ctx, a, &tex, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
read_thumbnail(ctx, doc, a, entry);
}
}
}

View File

@ -8,6 +8,7 @@ typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);

View File

@ -388,6 +388,21 @@ TEST(Ooxml, Docx1) {
cleanup(&doc, &f);
}
// Parses embed_tn.docx with the 500-character ooxml context and checks that
// (a) roughly 500 characters of content were extracted and
// (b) the store callback received data during parsing (store_size changed),
//     which is presumably the embedded thumbnail -- verify against the fixture.
TEST(Ooxml, Docx2Thumbnail) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/embed_tn.docx", &f, &doc);

    // Snapshot the running byte counter so only this parse is measured.
    const size_t stored_before = store_size;

    parse_ooxml(&ooxml_500_ctx, &f, &doc);

    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_NEAR(strlen(content), 500, 4);
    ASSERT_NE(stored_before, store_size);

    cleanup(&doc, &f);
}
TEST(Ooxml, Xlsx1) {
vfile_t f;
document_t doc;
@ -550,13 +565,13 @@ int main(int argc, char **argv) {
arc_recurse_media_ctx.log = noop_log;
arc_recurse_media_ctx.logf = noop_logf;
arc_recurse_media_ctx.store = noop_store;
arc_recurse_media_ctx.store = counter_store;
arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
arc_recurse_media_ctx.parse = _parse_media;
arc_list_ctx.log = noop_log;
arc_list_ctx.logf = noop_logf;
arc_list_ctx.store = noop_store;
arc_list_ctx.store = counter_store;
arc_list_ctx.mode = ARC_MODE_LIST;
text_500_ctx.content_size = 500;
@ -564,7 +579,7 @@ int main(int argc, char **argv) {
text_500_ctx.logf = noop_logf;
ebook_ctx.content_size = 999999999999;
ebook_ctx.store = noop_store;
ebook_ctx.store = counter_store;
ebook_ctx.tesseract_lang = "eng";
ebook_ctx.tesseract_path = "./tessdata";
ebook_ctx.tn_size = 500;
@ -576,7 +591,7 @@ int main(int argc, char **argv) {
media_ctx.log = noop_log;
media_ctx.logf = noop_logf;
media_ctx.store = noop_store;
media_ctx.store = counter_store;
media_ctx.tn_size = 500;
media_ctx.tn_qscale = 1.0;
media_ctx.max_media_buffer = (long)2000 * 1024 * 1024;
@ -584,6 +599,7 @@ int main(int argc, char **argv) {
ooxml_500_ctx.content_size = 500;
ooxml_500_ctx.log = noop_log;
ooxml_500_ctx.logf = noop_logf;
ooxml_500_ctx.store = counter_store;
mobi_500_ctx.content_size = 500;
mobi_500_ctx.log = noop_log;
@ -591,7 +607,7 @@ int main(int argc, char **argv) {
raw_ctx.log = noop_log;
raw_ctx.logf = noop_logf;
raw_ctx.store = noop_store;
raw_ctx.store = counter_store;
raw_ctx.tn_size = 500;
raw_ctx.tn_qscale = 5.0;

View File

@ -17,8 +17,10 @@ static void noop_log(const char *filepath, int level, char *str) {
// noop
}
static void noop_store(char* key, size_t key_len, char *value, size_t value_len) {
// noop
// Running total of value bytes passed to counter_store().
static size_t store_size = 0;

// Store callback that keeps nothing: it only adds the length of each value
// to store_size so callers can detect that something was stored.
static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
    // Only the value length is of interest; the other arguments are ignored.
    (void) key;
    (void) key_len;
    (void) value;

    store_size = store_size + value_len;
}
meta_line_t *get_meta(document_t *doc, metakey key);