mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-31 01:16:52 +00:00 
			
		
		
		
	Workaround for UTF8 .doc files
This commit is contained in:
		
							parent
							
								
									ae9fadec47
								
							
						
					
					
						commit
						0a9742b686
					
				| @ -255,7 +255,7 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc) { | ||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) { | ||||
| 
 | ||||
|     fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); | ||||
|     thread_ctx = *ctx; | ||||
| @ -285,26 +285,6 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     char title[8192] = {'\0',}; | ||||
|     fz_try(fzctx) | ||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); | ||||
|     fz_catch(fzctx) | ||||
|         ; | ||||
| 
 | ||||
|     if (strlen(title) > 0) { | ||||
|         APPEND_UTF8_META(doc, MetaTitle, title) | ||||
|     } | ||||
| 
 | ||||
|     char author[4096] = {'\0',}; | ||||
|     fz_try(fzctx) | ||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); | ||||
|     fz_catch(fzctx) | ||||
|         ; | ||||
| 
 | ||||
|     if (strlen(author) > 0) { | ||||
|         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||
|     } | ||||
| 
 | ||||
|     int page_count = -1; | ||||
|     fz_var(err); | ||||
|     fz_try(fzctx) | ||||
| @ -331,6 +311,33 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (tn_only) { | ||||
|         fz_drop_stream(fzctx, stream); | ||||
|         fz_drop_document(fzctx, fzdoc); | ||||
|         fz_drop_context(fzctx); | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     char title[8192] = {'\0',}; | ||||
|     fz_try(fzctx) | ||||
|                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); | ||||
|     fz_catch(fzctx) | ||||
|         ; | ||||
| 
 | ||||
|     if (strlen(title) > 0) { | ||||
|         APPEND_UTF8_META(doc, MetaTitle, title) | ||||
|     } | ||||
| 
 | ||||
|     char author[4096] = {'\0',}; | ||||
|     fz_try(fzctx) | ||||
|                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); | ||||
|     fz_catch(fzctx) | ||||
|         ; | ||||
| 
 | ||||
|     if (strlen(author) > 0) { | ||||
|         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
|     if (ctx->content_size > 0) { | ||||
|         fz_stext_options opts = {0}; | ||||
| @ -425,6 +432,6 @@ void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, docume | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc); | ||||
|     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE); | ||||
|     free(buf); | ||||
| } | ||||
|  | ||||
| @ -16,6 +16,6 @@ typedef struct { | ||||
| } scan_ebook_ctx_t; | ||||
| 
 | ||||
| void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str,  document_t *doc); | ||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const char* mime_str,  document_t *doc); | ||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only); | ||||
| 
 | ||||
| #endif | ||||
|  | ||||
| @ -22,7 +22,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | ||||
| 
 | ||||
|     int doc_word_version = iGuessVersionNumber(file_in, buf_len); | ||||
|     if (doc_word_version < 0 || doc_word_version == 3) { | ||||
|         fclose(file_in); | ||||
|         free(buf); | ||||
|         return; | ||||
|     } | ||||
| @ -68,7 +67,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | ||||
|         text_buffer_destroy(&tex); | ||||
|     } | ||||
| 
 | ||||
|     fclose(file_in); | ||||
|     free(buf); | ||||
|     free(out_buf); | ||||
| } | ||||
| @ -84,7 +82,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | ||||
|     }; | ||||
| 
 | ||||
|     // Open word doc
 | ||||
| 
 | ||||
|     options_type *opts = direct_vGetOptions(); | ||||
|     opts->iParagraphBreak = 74; | ||||
|     opts->eConversionType = conversion_pdf; | ||||
| @ -98,7 +95,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | ||||
| 
 | ||||
|     int doc_word_version = iGuessVersionNumber(file, buf_len); | ||||
|     if (doc_word_version < 0 || doc_word_version == 3) { | ||||
|         fclose(file); | ||||
|         free(buf); | ||||
|         return; | ||||
|     } | ||||
| @ -111,7 +107,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | ||||
| 
 | ||||
|     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); | ||||
|     if (diag == NULL) { | ||||
|         fclose(file); | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
| @ -120,9 +115,8 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | ||||
| 
 | ||||
|     fclose(file_out); | ||||
| 
 | ||||
|     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc); | ||||
|     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE); | ||||
| 
 | ||||
|     fclose(file); | ||||
|     free(buf); | ||||
|     free(out_buf); | ||||
| } | ||||
| @ -144,8 +138,10 @@ void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
|     } | ||||
| 
 | ||||
|     if (ctx->tn_size > 0) { | ||||
|         parse_msdoc_pdf(ctx, doc, file, buf, buf_len); | ||||
|     } else { | ||||
|         parse_msdoc_text(ctx, doc, file, buf, buf_len); | ||||
|         char *buf_pdf = malloc(buf_len); | ||||
|         memcpy(buf_pdf, buf, buf_len); | ||||
|         parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len); | ||||
|     } | ||||
|     parse_msdoc_text(ctx, doc, file, buf, buf_len); | ||||
|     fclose(file); | ||||
| } | ||||
|  | ||||
| @ -786,6 +786,34 @@ TEST(Msdoc, Test4Pdf) { | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(Msdoc, TestUtf8Pdf) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||
| 
 | ||||
|     size_t size_before = store_size; | ||||
| 
 | ||||
|     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||
| 
 | ||||
|     ASSERT_NE(get_meta(&doc, MetaContent), nullptr); | ||||
|     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||
|     ASSERT_NE(size_before, store_size); | ||||
| 
 | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(Msdoc, TestUtf8Text) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||
| 
 | ||||
|     parse_msdoc(&msdoc_text_ctx, &f, &doc); | ||||
| 
 | ||||
|     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||
| 
 | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(Msdoc, TestFuzz1) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|  | ||||
							
								
								
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | ||||
| Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6 | ||||
| Subproject commit 62ae66db99e9dd88dfa31999f516f71bb8bdc8b2 | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user