mirror of
				https://github.com/simon987/libscan.git
				synced 2025-11-04 11:06:52 +00:00 
			
		
		
		
	Read embedded thumbnail simon987/sist2#74
This commit is contained in:
		
							parent
							
								
									00d5680217
								
							
						
					
					
						commit
						9a240f039b
					
				@ -53,7 +53,6 @@ find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5
 | 
				
			|||||||
target_compile_options(
 | 
					target_compile_options(
 | 
				
			||||||
        scan
 | 
					        scan
 | 
				
			||||||
        PRIVATE
 | 
					        PRIVATE
 | 
				
			||||||
        -Werror
 | 
					 | 
				
			||||||
        -g
 | 
					        -g
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -145,6 +145,23 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_
 | 
				
			|||||||
    return 0;
 | 
					    return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Upper bound, in bytes, on the size of an embedded thumbnail entry that
					// read_thumbnail accepts (15 MiB). Parenthesized so the macro expands as a
					// single value regardless of the operators around it.
					#define MAX_TN_SIZE (1024 * 1024 * 15)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
					 * Store the data of the current archive entry as the document's thumbnail.
					 *
					 * The entry payload is read into a temporary heap buffer and handed to
					 * ctx->store, keyed by doc->uuid. Nothing is stored (and no thumbnail
					 * metadata is appended) when the reported entry size is outside
					 * (0, MAX_TN_SIZE], when the buffer cannot be allocated, or when no data
					 * could be read from the archive.
					 *
					 * @param ctx   scan context; only its store callback is used here
					 * @param doc   document that receives the thumbnail metadata; its uuid is the store key
					 * @param a     archive handle the entry data is read from
					 * @param entry header of the entry to read, used for its size
					 */
					void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
					    // Keep the size signed so that a negative value is rejected by the range
					    // check instead of wrapping around to a huge unsigned number.
					    long long entry_size = (long long) archive_entry_size(entry);

					    if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
					        return;
					    }

					    char *buf = malloc((size_t) entry_size);
					    if (buf == NULL) {
					        return;
					    }

					    // archive_read_data returns the number of bytes actually read, or a
					    // negative value on error; only that many bytes of buf are initialised.
					    long long read_len = (long long) archive_read_data(a, buf, (size_t) entry_size);
					    if (read_len <= 0) {
					        free(buf);
					        return;
					    }

					    APPEND_TN_META(doc, 1, 1) // Size unknown
					    ctx->store((char *) doc->uuid, sizeof(doc->uuid), buf, (size_t) read_len);
					    free(buf);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
 | 
					void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    size_t buf_len;
 | 
					    size_t buf_len;
 | 
				
			||||||
@ -180,6 +197,8 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
 | 
				
			|||||||
                if (read_doc_props(ctx, a, &tex, doc) != 0) {
 | 
					                if (read_doc_props(ctx, a, &tex, doc) != 0) {
 | 
				
			||||||
                    break;
 | 
					                    break;
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
 | 
					            } else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
 | 
				
			||||||
 | 
					                read_thumbnail(ctx, doc, a, entry);
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
				
			|||||||
@ -8,6 +8,7 @@ typedef struct {
 | 
				
			|||||||
    long content_size;
 | 
					    long content_size;
 | 
				
			||||||
    log_callback_t log;
 | 
					    log_callback_t log;
 | 
				
			||||||
    logf_callback_t logf;
 | 
					    logf_callback_t logf;
 | 
				
			||||||
 | 
					    store_callback_t store;
 | 
				
			||||||
} scan_ooxml_ctx_t;
 | 
					} scan_ooxml_ctx_t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
 | 
					void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
 | 
				
			||||||
 | 
				
			|||||||
@ -388,6 +388,21 @@ TEST(Ooxml, Docx1) {
 | 
				
			|||||||
    cleanup(&doc, &f);
 | 
					    cleanup(&doc, &f);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Parsing embed_tn.docx must yield roughly 500 characters of content and
					// must change store_size, i.e. some data was handed to the store callback.
					TEST(Ooxml, Docx2Thumbnail) {
					    document_t doc;
					    vfile_t f;
					    load_doc_file("libscan-test-files/test_files/ooxml/embed_tn.docx", &f, &doc);

					    // Snapshot the running store counter so the effect of this parse is visible.
					    const size_t stored_before = store_size;
					    parse_ooxml(&ooxml_500_ctx, &f, &doc);

					    const char *content = get_meta(&doc, MetaContent)->str_val;
					    ASSERT_NEAR(strlen(content), 500, 4);
					    ASSERT_NE(stored_before, store_size);

					    cleanup(&doc, &f);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TEST(Ooxml, Xlsx1) {
 | 
					TEST(Ooxml, Xlsx1) {
 | 
				
			||||||
    vfile_t f;
 | 
					    vfile_t f;
 | 
				
			||||||
    document_t doc;
 | 
					    document_t doc;
 | 
				
			||||||
@ -550,13 +565,13 @@ int main(int argc, char **argv) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    arc_recurse_media_ctx.log = noop_log;
 | 
					    arc_recurse_media_ctx.log = noop_log;
 | 
				
			||||||
    arc_recurse_media_ctx.logf = noop_logf;
 | 
					    arc_recurse_media_ctx.logf = noop_logf;
 | 
				
			||||||
    arc_recurse_media_ctx.store = noop_store;
 | 
					    arc_recurse_media_ctx.store = counter_store;
 | 
				
			||||||
    arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
 | 
					    arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
 | 
				
			||||||
    arc_recurse_media_ctx.parse = _parse_media;
 | 
					    arc_recurse_media_ctx.parse = _parse_media;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    arc_list_ctx.log = noop_log;
 | 
					    arc_list_ctx.log = noop_log;
 | 
				
			||||||
    arc_list_ctx.logf = noop_logf;
 | 
					    arc_list_ctx.logf = noop_logf;
 | 
				
			||||||
    arc_list_ctx.store = noop_store;
 | 
					    arc_list_ctx.store = counter_store;
 | 
				
			||||||
    arc_list_ctx.mode = ARC_MODE_LIST;
 | 
					    arc_list_ctx.mode = ARC_MODE_LIST;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    text_500_ctx.content_size = 500;
 | 
					    text_500_ctx.content_size = 500;
 | 
				
			||||||
@ -564,7 +579,7 @@ int main(int argc, char **argv) {
 | 
				
			|||||||
    text_500_ctx.logf = noop_logf;
 | 
					    text_500_ctx.logf = noop_logf;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ebook_ctx.content_size = 999999999999;
 | 
					    ebook_ctx.content_size = 999999999999;
 | 
				
			||||||
    ebook_ctx.store = noop_store;
 | 
					    ebook_ctx.store = counter_store;
 | 
				
			||||||
    ebook_ctx.tesseract_lang = "eng";
 | 
					    ebook_ctx.tesseract_lang = "eng";
 | 
				
			||||||
    ebook_ctx.tesseract_path = "./tessdata";
 | 
					    ebook_ctx.tesseract_path = "./tessdata";
 | 
				
			||||||
    ebook_ctx.tn_size = 500;
 | 
					    ebook_ctx.tn_size = 500;
 | 
				
			||||||
@ -576,7 +591,7 @@ int main(int argc, char **argv) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    media_ctx.log = noop_log;
 | 
					    media_ctx.log = noop_log;
 | 
				
			||||||
    media_ctx.logf = noop_logf;
 | 
					    media_ctx.logf = noop_logf;
 | 
				
			||||||
    media_ctx.store = noop_store;
 | 
					    media_ctx.store = counter_store;
 | 
				
			||||||
    media_ctx.tn_size = 500;
 | 
					    media_ctx.tn_size = 500;
 | 
				
			||||||
    media_ctx.tn_qscale = 1.0;
 | 
					    media_ctx.tn_qscale = 1.0;
 | 
				
			||||||
    media_ctx.max_media_buffer = (long)2000 * 1024 * 1024;
 | 
					    media_ctx.max_media_buffer = (long)2000 * 1024 * 1024;
 | 
				
			||||||
@ -584,6 +599,7 @@ int main(int argc, char **argv) {
 | 
				
			|||||||
    ooxml_500_ctx.content_size = 500;
 | 
					    ooxml_500_ctx.content_size = 500;
 | 
				
			||||||
    ooxml_500_ctx.log = noop_log;
 | 
					    ooxml_500_ctx.log = noop_log;
 | 
				
			||||||
    ooxml_500_ctx.logf = noop_logf;
 | 
					    ooxml_500_ctx.logf = noop_logf;
 | 
				
			||||||
 | 
					    ooxml_500_ctx.store = counter_store;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    mobi_500_ctx.content_size = 500;
 | 
					    mobi_500_ctx.content_size = 500;
 | 
				
			||||||
    mobi_500_ctx.log = noop_log;
 | 
					    mobi_500_ctx.log = noop_log;
 | 
				
			||||||
@ -591,7 +607,7 @@ int main(int argc, char **argv) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    raw_ctx.log = noop_log;
 | 
					    raw_ctx.log = noop_log;
 | 
				
			||||||
    raw_ctx.logf = noop_logf;
 | 
					    raw_ctx.logf = noop_logf;
 | 
				
			||||||
    raw_ctx.store = noop_store;
 | 
					    raw_ctx.store = counter_store;
 | 
				
			||||||
    raw_ctx.tn_size = 500;
 | 
					    raw_ctx.tn_size = 500;
 | 
				
			||||||
    raw_ctx.tn_qscale = 5.0;
 | 
					    raw_ctx.tn_qscale = 5.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -17,8 +17,10 @@ static void noop_log(const char *filepath, int level, char *str) {
 | 
				
			|||||||
    // noop
 | 
					    // noop
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void noop_store(char* key, size_t key_len, char *value, size_t value_len) {
 | 
					static size_t store_size = 0;
 | 
				
			||||||
    // noop
 | 
					
 | 
				
			||||||
 | 
					static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
 | 
				
			||||||
 | 
					    store_size += value_len;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
meta_line_t *get_meta(document_t *doc, metakey key);
 | 
					meta_line_t *get_meta(document_t *doc, metakey key);
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user