mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-31 09:26:53 +00:00 
			
		
		
		
	Workaround for UTF8 .doc files
This commit is contained in:
		
							parent
							
								
									ae9fadec47
								
							
						
					
					
						commit
						0a9742b686
					
				| @ -255,7 +255,7 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc) { | void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) { | ||||||
| 
 | 
 | ||||||
|     fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); |     fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); | ||||||
|     thread_ctx = *ctx; |     thread_ctx = *ctx; | ||||||
| @ -285,26 +285,6 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     char title[8192] = {'\0',}; |  | ||||||
|     fz_try(fzctx) |  | ||||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); |  | ||||||
|     fz_catch(fzctx) |  | ||||||
|         ; |  | ||||||
| 
 |  | ||||||
|     if (strlen(title) > 0) { |  | ||||||
|         APPEND_UTF8_META(doc, MetaTitle, title) |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     char author[4096] = {'\0',}; |  | ||||||
|     fz_try(fzctx) |  | ||||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); |  | ||||||
|     fz_catch(fzctx) |  | ||||||
|         ; |  | ||||||
| 
 |  | ||||||
|     if (strlen(author) > 0) { |  | ||||||
|         APPEND_UTF8_META(doc, MetaAuthor, author) |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     int page_count = -1; |     int page_count = -1; | ||||||
|     fz_var(err); |     fz_var(err); | ||||||
|     fz_try(fzctx) |     fz_try(fzctx) | ||||||
| @ -331,6 +311,33 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     if (tn_only) { | ||||||
|  |         fz_drop_stream(fzctx, stream); | ||||||
|  |         fz_drop_document(fzctx, fzdoc); | ||||||
|  |         fz_drop_context(fzctx); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     char title[8192] = {'\0',}; | ||||||
|  |     fz_try(fzctx) | ||||||
|  |                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); | ||||||
|  |     fz_catch(fzctx) | ||||||
|  |         ; | ||||||
|  | 
 | ||||||
|  |     if (strlen(title) > 0) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaTitle, title) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     char author[4096] = {'\0',}; | ||||||
|  |     fz_try(fzctx) | ||||||
|  |                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); | ||||||
|  |     fz_catch(fzctx) | ||||||
|  |         ; | ||||||
|  | 
 | ||||||
|  |     if (strlen(author) > 0) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
|     if (ctx->content_size > 0) { |     if (ctx->content_size > 0) { | ||||||
|         fz_stext_options opts = {0}; |         fz_stext_options opts = {0}; | ||||||
| @ -425,6 +432,6 @@ void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, docume | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc); |     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE); | ||||||
|     free(buf); |     free(buf); | ||||||
| } | } | ||||||
|  | |||||||
| @ -16,6 +16,6 @@ typedef struct { | |||||||
| } scan_ebook_ctx_t; | } scan_ebook_ctx_t; | ||||||
| 
 | 
 | ||||||
| void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str,  document_t *doc); | void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str,  document_t *doc); | ||||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const char* mime_str,  document_t *doc); | void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only); | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
|  | |||||||
| @ -22,7 +22,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | |||||||
| 
 | 
 | ||||||
|     int doc_word_version = iGuessVersionNumber(file_in, buf_len); |     int doc_word_version = iGuessVersionNumber(file_in, buf_len); | ||||||
|     if (doc_word_version < 0 || doc_word_version == 3) { |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|         fclose(file_in); |  | ||||||
|         free(buf); |         free(buf); | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| @ -68,7 +67,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | |||||||
|         text_buffer_destroy(&tex); |         text_buffer_destroy(&tex); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     fclose(file_in); |  | ||||||
|     free(buf); |     free(buf); | ||||||
|     free(out_buf); |     free(out_buf); | ||||||
| } | } | ||||||
| @ -84,7 +82,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     // Open word doc
 |     // Open word doc
 | ||||||
| 
 |  | ||||||
|     options_type *opts = direct_vGetOptions(); |     options_type *opts = direct_vGetOptions(); | ||||||
|     opts->iParagraphBreak = 74; |     opts->iParagraphBreak = 74; | ||||||
|     opts->eConversionType = conversion_pdf; |     opts->eConversionType = conversion_pdf; | ||||||
| @ -98,7 +95,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
| 
 | 
 | ||||||
|     int doc_word_version = iGuessVersionNumber(file, buf_len); |     int doc_word_version = iGuessVersionNumber(file, buf_len); | ||||||
|     if (doc_word_version < 0 || doc_word_version == 3) { |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|         fclose(file); |  | ||||||
|         free(buf); |         free(buf); | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| @ -111,7 +107,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
| 
 | 
 | ||||||
|     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); |     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); | ||||||
|     if (diag == NULL) { |     if (diag == NULL) { | ||||||
|         fclose(file); |  | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| @ -120,9 +115,8 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
| 
 | 
 | ||||||
|     fclose(file_out); |     fclose(file_out); | ||||||
| 
 | 
 | ||||||
|     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc); |     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE); | ||||||
| 
 | 
 | ||||||
|     fclose(file); |  | ||||||
|     free(buf); |     free(buf); | ||||||
|     free(out_buf); |     free(out_buf); | ||||||
| } | } | ||||||
| @ -144,8 +138,10 @@ void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | |||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (ctx->tn_size > 0) { |     if (ctx->tn_size > 0) { | ||||||
|         parse_msdoc_pdf(ctx, doc, file, buf, buf_len); |         char *buf_pdf = malloc(buf_len); | ||||||
|     } else { |         memcpy(buf_pdf, buf, buf_len); | ||||||
|         parse_msdoc_text(ctx, doc, file, buf, buf_len); |         parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len); | ||||||
|     } |     } | ||||||
|  |     parse_msdoc_text(ctx, doc, file, buf, buf_len); | ||||||
|  |     fclose(file); | ||||||
| } | } | ||||||
|  | |||||||
| @ -786,6 +786,34 @@ TEST(Msdoc, Test4Pdf) { | |||||||
|     cleanup(&doc, &f); |     cleanup(&doc, &f); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | TEST(Msdoc, TestUtf8Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_NE(get_meta(&doc, MetaContent), nullptr); | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, TestUtf8Text) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_text_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| TEST(Msdoc, TestFuzz1) { | TEST(Msdoc, TestFuzz1) { | ||||||
|     vfile_t f; |     vfile_t f; | ||||||
|     document_t doc; |     document_t doc; | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | |||||||
| Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6 | Subproject commit 62ae66db99e9dd88dfa31999f516f71bb8bdc8b2 | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user