mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-31 01:16:52 +00:00 
			
		
		
		
	Add .doc support
This commit is contained in:
		
							parent
							
								
									8643f5fb65
								
							
						
					
					
						commit
						070186fea0
					
				
							
								
								
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitmodules
									
									
									
									
										vendored
									
									
								
							| @ -1,3 +1,6 @@ | |||||||
| [submodule "third-party/utf8.h"] | [submodule "third-party/utf8.h"] | ||||||
| 	path = third-party/utf8.h | 	path = third-party/utf8.h | ||||||
| 	url = https://github.com/sheredom/utf8.h | 	url = https://github.com/sheredom/utf8.h | ||||||
|  | [submodule "third-party/antiword"] | ||||||
|  | 	path = third-party/antiword | ||||||
|  | 	url = https://github.com/simon987/antiword | ||||||
|  | |||||||
| @ -5,6 +5,12 @@ set(CMAKE_C_STANDARD 11) | |||||||
| 
 | 
 | ||||||
| option(BUILD_TESTS "Build tests" off) | option(BUILD_TESTS "Build tests" off) | ||||||
| 
 | 
 | ||||||
|  | add_subdirectory(third-party/antiword) | ||||||
|  | add_compile_definitions( | ||||||
|  |         antiword | ||||||
|  |         NDEBUG | ||||||
|  | ) | ||||||
|  | 
 | ||||||
| add_library( | add_library( | ||||||
|         scan |         scan | ||||||
|         libscan/util.c libscan/util.h |         libscan/util.c libscan/util.h | ||||||
| @ -18,6 +24,7 @@ add_library( | |||||||
|         libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h |         libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h | ||||||
|         libscan/media/media.c libscan/media/media.h |         libscan/media/media.c libscan/media/media.h | ||||||
|         libscan/font/font.c libscan/font/font.h |         libscan/font/font.c libscan/font/font.h | ||||||
|  |         libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h | ||||||
| 
 | 
 | ||||||
|         third-party/utf8.h |         third-party/utf8.h | ||||||
|         libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h) |         libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h) | ||||||
| @ -110,6 +117,7 @@ add_dependencies( | |||||||
|         scan |         scan | ||||||
|         libmobi |         libmobi | ||||||
|         ffmpeg |         ffmpeg | ||||||
|  |         antiword | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| target_link_libraries( | target_link_libraries( | ||||||
| @ -161,6 +169,7 @@ target_link_libraries( | |||||||
|         ${JAS_LIB} |         ${JAS_LIB} | ||||||
|         ${GUMBO_LIB} |         ${GUMBO_LIB} | ||||||
|         dl |         dl | ||||||
|  |         antiword | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| target_include_directories( | target_include_directories( | ||||||
| @ -183,4 +192,8 @@ if (BUILD_TESTS) | |||||||
|     add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h) |     add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h) | ||||||
|     target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer) |     target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer) | ||||||
|     target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan) |     target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan) | ||||||
|  | 
 | ||||||
|  |     add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h) | ||||||
|  |     target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer) | ||||||
|  |     target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan) | ||||||
| endif() | endif() | ||||||
|  | |||||||
| @ -292,10 +292,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         ; |         ; | ||||||
| 
 | 
 | ||||||
|     if (strlen(title) > 0) { |     if (strlen(title) > 0) { | ||||||
|         meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title)); |         APPEND_UTF8_META(doc, MetaTitle, title) | ||||||
|         meta_title->key = MetaTitle; |  | ||||||
|         strcpy(meta_title->str_val, title); |  | ||||||
|         APPEND_META(doc, meta_title) |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     char author[4096] = {'\0',}; |     char author[4096] = {'\0',}; | ||||||
| @ -305,10 +302,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         ; |         ; | ||||||
| 
 | 
 | ||||||
|     if (strlen(author) > 0) { |     if (strlen(author) > 0) { | ||||||
|         meta_line_t *meta_author = malloc(sizeof(meta_line_t) + strlen(author)); |         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||||
|         meta_author->key = MetaAuthor; |  | ||||||
|         strcpy(meta_author->str_val, author); |  | ||||||
|         APPEND_META(doc, meta_author) |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     int page_count = -1; |     int page_count = -1; | ||||||
|  | |||||||
| @ -1,16 +1,16 @@ | |||||||
| #ifndef	FALSE | #ifndef FALSE | ||||||
| #define	FALSE	(0) | #define FALSE (0) | ||||||
| #define BOOL int | #define BOOL int | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifndef	TRUE | #ifndef TRUE | ||||||
| #define	TRUE	(!FALSE) | #define TRUE (!FALSE) | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #undef	MAX | #undef MAX | ||||||
| #define MAX(a, b)  (((a) > (b)) ? (a) : (b)) | #define MAX(a, b)  (((a) > (b)) ? (a) : (b)) | ||||||
| 
 | 
 | ||||||
| #undef	MIN | #undef MIN | ||||||
| #define MIN(a, b)  (((a) < (b)) ? (a) : (b)) | #define MIN(a, b)  (((a) < (b)) ? (a) : (b)) | ||||||
| 
 | 
 | ||||||
| #ifndef PATH_MAX | #ifndef PATH_MAX | ||||||
| @ -18,7 +18,7 @@ | |||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #undef ABS | #undef ABS | ||||||
| #define ABS(a)	   (((a) < 0) ? -(a) : (a)) | #define ABS(a) (((a) < 0) ? -(a) : (a)) | ||||||
| 
 | 
 | ||||||
| #define APPEND_STR_META(doc, keyname, value) \ | #define APPEND_STR_META(doc, keyname, value) \ | ||||||
|     {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ |     {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ | ||||||
| @ -37,3 +37,23 @@ | |||||||
|     meta_str->key = MetaThumbnail; \ |     meta_str->key = MetaThumbnail; \ | ||||||
|     sprintf(meta_str->str_val, "%04d,%04d", width, height); \ |     sprintf(meta_str->str_val, "%04d,%04d", width, height); \ | ||||||
|     APPEND_META(doc, meta_str)} |     APPEND_META(doc, meta_str)} | ||||||
|  | 
 | ||||||
|  | #define APPEND_META(doc, meta) \ | ||||||
|  |     meta->next = NULL;\ | ||||||
|  |     if (doc->meta_head == NULL) {\ | ||||||
|  |         doc->meta_head = meta;\ | ||||||
|  |         doc->meta_tail = doc->meta_head;\ | ||||||
|  |     } else {\ | ||||||
|  |         doc->meta_tail->next = meta;\ | ||||||
|  |         doc->meta_tail = meta;\ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | #define APPEND_UTF8_META(doc, keyname, str) \ | ||||||
|  |     text_buffer_t tex = text_buffer_create(-1); \ | ||||||
|  |     text_buffer_append_string0(&tex, str); \ | ||||||
|  |     text_buffer_terminate_string(&tex); \ | ||||||
|  |     meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \ | ||||||
|  |     meta_tag->key = keyname; \ | ||||||
|  |     strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \ | ||||||
|  |     APPEND_META(doc, meta_tag) \ | ||||||
|  |     text_buffer_destroy(&tex); | ||||||
|  | |||||||
| @ -166,15 +166,8 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic | |||||||
|     text_buffer_destroy(&tex); |     text_buffer_destroy(&tex); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #define APPEND_TAG_META(doc, tag_, keyname) \ | #define APPEND_TAG_META(keyname) \ | ||||||
|     text_buffer_t tex = text_buffer_create(-1); \ |     APPEND_UTF8_META(doc, keyname, tag->value) | ||||||
|     text_buffer_append_string0(&tex, tag_->value); \ |  | ||||||
|     text_buffer_terminate_string(&tex); \ |  | ||||||
|     meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \ |  | ||||||
|     meta_tag->key = keyname; \ |  | ||||||
|     strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \ |  | ||||||
|     APPEND_META(doc, meta_tag) \ |  | ||||||
|     text_buffer_destroy(&tex); |  | ||||||
| 
 | 
 | ||||||
| #define STRCPY_TOLOWER(dst, str) \ | #define STRCPY_TOLOWER(dst, str) \ | ||||||
|     strncpy(dst, str, sizeof(dst)); \ |     strncpy(dst, str, sizeof(dst)); \ | ||||||
| @ -190,17 +183,17 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) { | |||||||
|         STRCPY_TOLOWER(key, tag->key) |         STRCPY_TOLOWER(key, tag->key) | ||||||
| 
 | 
 | ||||||
|         if (strcmp(key, "artist") == 0) { |         if (strcmp(key, "artist") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaArtist) |             APPEND_TAG_META(MetaArtist) | ||||||
|         } else if (strcmp(key, "genre") == 0) { |         } else if (strcmp(key, "genre") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaGenre) |             APPEND_TAG_META(MetaGenre) | ||||||
|         } else if (strcmp(key, "title") == 0) { |         } else if (strcmp(key, "title") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaTitle) |             APPEND_TAG_META(MetaTitle) | ||||||
|         } else if (strcmp(key, "album_artist") == 0) { |         } else if (strcmp(key, "album_artist") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaAlbumArtist) |             APPEND_TAG_META(MetaAlbumArtist) | ||||||
|         } else if (strcmp(key, "album") == 0) { |         } else if (strcmp(key, "album") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaAlbum) |             APPEND_TAG_META(MetaAlbum) | ||||||
|         } else if (strcmp(key, "comment") == 0) { |         } else if (strcmp(key, "comment") == 0) { | ||||||
|             APPEND_TAG_META(doc, tag, MetaContent) |             APPEND_TAG_META(MetaContent) | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @ -244,25 +237,25 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f | |||||||
|             if (strcmp(key, "artist") == 0) { |             if (strcmp(key, "artist") == 0) { | ||||||
|                 append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist); |                 append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist); | ||||||
|             } else if (strcmp(tag->key, "ImageDescription") == 0) { |             } else if (strcmp(tag->key, "ImageDescription") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaContent) |                 APPEND_TAG_META(MetaContent) | ||||||
|             } else if (strcmp(tag->key, "Make") == 0) { |             } else if (strcmp(tag->key, "Make") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifMake) |                 APPEND_TAG_META(MetaExifMake) | ||||||
|             } else if (strcmp(tag->key, "Model") == 0) { |             } else if (strcmp(tag->key, "Model") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifModel) |                 APPEND_TAG_META(MetaExifModel) | ||||||
|             } else if (strcmp(tag->key, "Software") == 0) { |             } else if (strcmp(tag->key, "Software") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifSoftware) |                 APPEND_TAG_META(MetaExifSoftware) | ||||||
|             } else if (strcmp(tag->key, "FNumber") == 0) { |             } else if (strcmp(tag->key, "FNumber") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifFNumber) |                 APPEND_TAG_META(MetaExifFNumber) | ||||||
|             } else if (strcmp(tag->key, "FocalLength") == 0) { |             } else if (strcmp(tag->key, "FocalLength") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifFocalLength) |                 APPEND_TAG_META(MetaExifFocalLength) | ||||||
|             } else if (strcmp(tag->key, "UserComment") == 0) { |             } else if (strcmp(tag->key, "UserComment") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifUserComment) |                 APPEND_TAG_META(MetaExifUserComment) | ||||||
|             } else if (strcmp(tag->key, "ISOSpeedRatings") == 0) { |             } else if (strcmp(tag->key, "ISOSpeedRatings") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings) |                 APPEND_TAG_META(MetaExifIsoSpeedRatings) | ||||||
|             } else if (strcmp(tag->key, "ExposureTime") == 0) { |             } else if (strcmp(tag->key, "ExposureTime") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifExposureTime) |                 APPEND_TAG_META(MetaExifExposureTime) | ||||||
|             } else if (strcmp(tag->key, "DateTime") == 0) { |             } else if (strcmp(tag->key, "DateTime") == 0) { | ||||||
|                 APPEND_TAG_META(doc, tag, MetaExifDateTime) |                 APPEND_TAG_META(MetaExifDateTime) | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  | |||||||
							
								
								
									
										165
									
								
								libscan/msdoc/msdoc.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										165
									
								
								libscan/msdoc/msdoc.c
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,165 @@ | |||||||
|  | #include "msdoc.h" | ||||||
|  | #include <errno.h> | ||||||
|  | 
 | ||||||
|  | #include <sys/mman.h> | ||||||
|  | #include "../../third-party/antiword/src/antiword.h" | ||||||
|  | 
 | ||||||
|  | #include "../ebook/ebook.h" | ||||||
|  | 
 | ||||||
|  | void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||||
|  | 
 | ||||||
|  |     // Open file
 | ||||||
|  |     size_t buf_len; | ||||||
|  |     char *buf = read_all(f, &buf_len); | ||||||
|  |     if (buf == NULL) { | ||||||
|  |         CTX_LOG_ERROR(f->filepath, "read_all() failed") | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     FILE *file_in = fmemopen(buf, buf_len, "rb"); | ||||||
|  |     if (file_in == NULL) { | ||||||
|  |         free(buf); | ||||||
|  |         CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Open word doc
 | ||||||
|  |     options_type *opts = direct_vGetOptions(); | ||||||
|  |     opts->iParagraphBreak = 74; | ||||||
|  |     opts->eConversionType = conversion_text; | ||||||
|  |     opts->bHideHiddenText = 1; | ||||||
|  |     opts->bRemoveRemovedText = 1; | ||||||
|  |     opts->bUseLandscape = 0; | ||||||
|  |     opts->eEncoding = encoding_utf_8; | ||||||
|  |     opts->iPageHeight = 842; // A4
 | ||||||
|  |     opts->iPageWidth = 595; | ||||||
|  |     opts->eImageLevel = level_ps_3; | ||||||
|  | 
 | ||||||
|  |     int doc_word_version = iGuessVersionNumber(file_in, buf_len); | ||||||
|  |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|  |         fclose(file_in); | ||||||
|  |         free(buf); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     rewind(file_in); | ||||||
|  | 
 | ||||||
|  |     size_t out_len; | ||||||
|  |     char *out_buf; | ||||||
|  | 
 | ||||||
|  |     FILE *file_out = open_memstream(&out_buf, &out_len); | ||||||
|  | 
 | ||||||
|  |     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); | ||||||
|  |     if (diag == NULL) { | ||||||
|  |         fclose(file_in); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     iInitDocument(file_in, buf_len); | ||||||
|  |     const char* author = szGetAuthor(); | ||||||
|  |     if (author != NULL) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const char* title = szGetTitle(); | ||||||
|  |     if (title != NULL) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaTitle, title) | ||||||
|  |     } | ||||||
|  |     vFreeDocument(); | ||||||
|  | 
 | ||||||
|  |     bWordDecryptor(file_in, buf_len, diag); | ||||||
|  |     vDestroyDiagram(diag); | ||||||
|  |     fclose(file_out); | ||||||
|  | 
 | ||||||
|  |     if (buf_len > 0) { | ||||||
|  |         text_buffer_t tex = text_buffer_create(ctx->content_size); | ||||||
|  |         text_buffer_append_string(&tex, out_buf, out_len); | ||||||
|  |         text_buffer_terminate_string(&tex); | ||||||
|  | 
 | ||||||
|  |         meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); | ||||||
|  |         meta_content->key = MetaContent; | ||||||
|  |         memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur); | ||||||
|  |         APPEND_META(doc, meta_content) | ||||||
|  | 
 | ||||||
|  |         text_buffer_destroy(&tex); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     fclose(file_in); | ||||||
|  |     free(buf); | ||||||
|  |     free(out_buf); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||||
|  | 
 | ||||||
|  |     scan_ebook_ctx_t ebook_ctx = { | ||||||
|  |             .content_size = ctx->content_size, | ||||||
|  |             .tn_size = ctx->tn_size, | ||||||
|  |             .log = ctx->log, | ||||||
|  |             .logf = ctx->logf, | ||||||
|  |             .store = ctx->store, | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     // Open file
 | ||||||
|  |     size_t buf_len; | ||||||
|  |     char *buf = read_all(f, &buf_len); | ||||||
|  |     if (buf == NULL) { | ||||||
|  |         CTX_LOG_ERROR(f->filepath, "read_all() failed") | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     FILE *file = fmemopen(buf, buf_len, "rb"); | ||||||
|  |     if (file == NULL) { | ||||||
|  |         free(buf); | ||||||
|  |         CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno) | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     // Open word doc
 | ||||||
|  | 
 | ||||||
|  |     options_type *opts = direct_vGetOptions(); | ||||||
|  |     opts->iParagraphBreak = 74; | ||||||
|  |     opts->eConversionType = conversion_pdf; | ||||||
|  |     opts->bHideHiddenText = 1; | ||||||
|  |     opts->bRemoveRemovedText = 1; | ||||||
|  |     opts->bUseLandscape = 0; | ||||||
|  |     opts->eEncoding = encoding_latin_2; | ||||||
|  |     opts->iPageHeight = 842; // A4
 | ||||||
|  |     opts->iPageWidth = 595; | ||||||
|  |     opts->eImageLevel = level_ps_3; | ||||||
|  | 
 | ||||||
|  |     int doc_word_version = iGuessVersionNumber(file, buf_len); | ||||||
|  |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|  |         fclose(file); | ||||||
|  |         free(buf); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     rewind(file); | ||||||
|  | 
 | ||||||
|  |     size_t out_len; | ||||||
|  |     char *out_buf; | ||||||
|  | 
 | ||||||
|  |     FILE *file_out = open_memstream(&out_buf, &out_len); | ||||||
|  | 
 | ||||||
|  |     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); | ||||||
|  |     if (diag == NULL) { | ||||||
|  |         fclose(file); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     int ret = bWordDecryptor(file, buf_len, diag); | ||||||
|  |     vDestroyDiagram(diag); | ||||||
|  | 
 | ||||||
|  |     fclose(file_out); | ||||||
|  | 
 | ||||||
|  |     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc); | ||||||
|  | 
 | ||||||
|  |     fclose(file); | ||||||
|  |     free(buf); | ||||||
|  |     free(out_buf); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||||
|  |     if (ctx->tn_size > 0) { | ||||||
|  |         parse_msdoc_pdf(ctx, f, doc); | ||||||
|  |     } else { | ||||||
|  |         parse_msdoc_text(ctx, f, doc); | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										22
									
								
								libscan/msdoc/msdoc.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								libscan/msdoc/msdoc.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,22 @@ | |||||||
|  | #ifndef SCAN_SCAN_MSDOC_H | ||||||
|  | #define SCAN_SCAN_MSDOC_H | ||||||
|  | 
 | ||||||
|  | #include "../scan.h" | ||||||
|  | 
 | ||||||
|  | typedef struct { | ||||||
|  |     long content_size; | ||||||
|  |     int tn_size; | ||||||
|  |     log_callback_t log; | ||||||
|  |     logf_callback_t logf; | ||||||
|  |     store_callback_t store; | ||||||
|  |     unsigned int msdoc_mime; | ||||||
|  | } scan_msdoc_ctx_t; | ||||||
|  | 
 | ||||||
|  | __always_inline | ||||||
|  | static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) { | ||||||
|  |     return mime == ctx->msdoc_mime; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc); | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
| @ -1,6 +1,8 @@ | |||||||
| #ifndef SCAN_SCAN_H | #ifndef SCAN_SCAN_H | ||||||
| #define SCAN_SCAN_H | #define SCAN_SCAN_H | ||||||
| 
 | 
 | ||||||
|  | #define _GNU_SOURCE | ||||||
|  | 
 | ||||||
| #include <stdio.h> | #include <stdio.h> | ||||||
| #include <sys/stat.h> | #include <sys/stat.h> | ||||||
| #include <uuid/uuid.h> | #include <uuid/uuid.h> | ||||||
| @ -147,16 +149,6 @@ typedef struct parse_job_t { | |||||||
| } parse_job_t; | } parse_job_t; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| #define APPEND_META(doc, meta) \ |  | ||||||
|     meta->next = NULL;\ |  | ||||||
|     if (doc->meta_head == NULL) {\ |  | ||||||
|         doc->meta_head = meta;\ |  | ||||||
|         doc->meta_tail = doc->meta_head;\ |  | ||||||
|     } else {\ |  | ||||||
|         doc->meta_tail->next = meta;\ |  | ||||||
|         doc->meta_tail = meta;\ |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| #include "util.h" | #include "util.h" | ||||||
| 
 | 
 | ||||||
| typedef void (*parse_callback_t)(parse_job_t *job); | typedef void (*parse_callback_t)(parse_job_t *job); | ||||||
|  | |||||||
| @ -273,7 +273,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t | |||||||
|     return 0; |     return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int text_buffer_append_string0(text_buffer_t *buf, char *str) { | static int text_buffer_append_string0(text_buffer_t *buf, const char *str) { | ||||||
|     return text_buffer_append_string(buf, str, strlen(str)); |     return text_buffer_append_string(buf, str, strlen(str)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										109
									
								
								test/main.cpp
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								test/main.cpp
									
									
									
									
									
								
							| @ -10,6 +10,7 @@ extern "C" { | |||||||
| #include "../libscan/ooxml/ooxml.h" | #include "../libscan/ooxml/ooxml.h" | ||||||
| #include "../libscan/mobi/scan_mobi.h" | #include "../libscan/mobi/scan_mobi.h" | ||||||
| #include "../libscan/raw/raw.h" | #include "../libscan/raw/raw.h" | ||||||
|  | #include "../libscan/msdoc/msdoc.h" | ||||||
| #include <libavutil/avutil.h> | #include <libavutil/avutil.h> | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -33,6 +34,10 @@ static scan_mobi_ctx_t mobi_500_ctx; | |||||||
| 
 | 
 | ||||||
| static scan_raw_ctx_t raw_ctx; | static scan_raw_ctx_t raw_ctx; | ||||||
| 
 | 
 | ||||||
|  | static scan_msdoc_ctx_t msdoc_ctx; | ||||||
|  | 
 | ||||||
|  | static scan_msdoc_ctx_t msdoc_text_ctx; | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| document_t LastSubDoc; | document_t LastSubDoc; | ||||||
| 
 | 
 | ||||||
| @ -689,6 +694,98 @@ TEST(RAW, Fuji) { | |||||||
|     cleanup(&doc, &f); |     cleanup(&doc, &f); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* msdoc */ | ||||||
|  | TEST(Msdoc, Test1Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); | ||||||
|  |     ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 57); | ||||||
|  |     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, Test1Text) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_text_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); | ||||||
|  |     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); | ||||||
|  |     ASSERT_EQ(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, Test2Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "GNU Free Documentation License") != nullptr); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging Information Format"); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender"); | ||||||
|  |     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, Test3Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification"); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet"); | ||||||
|  |     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, Test4Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "SQL Server international data types") != nullptr); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template"); | ||||||
|  |     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen"); | ||||||
|  |     ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| int main(int argc, char **argv) { | int main(int argc, char **argv) { | ||||||
|     setlocale(LC_ALL, ""); |     setlocale(LC_ALL, ""); | ||||||
| @ -753,6 +850,18 @@ int main(int argc, char **argv) { | |||||||
|     raw_ctx.tn_size = 500; |     raw_ctx.tn_size = 500; | ||||||
|     raw_ctx.tn_qscale = 5.0; |     raw_ctx.tn_qscale = 5.0; | ||||||
| 
 | 
 | ||||||
|  |     msdoc_ctx.log = noop_log; | ||||||
|  |     msdoc_ctx.logf = noop_logf; | ||||||
|  |     msdoc_ctx.store = counter_store; | ||||||
|  |     msdoc_ctx.content_size = 500; | ||||||
|  |     msdoc_ctx.tn_size = 500; | ||||||
|  | 
 | ||||||
|  |     msdoc_text_ctx.log = noop_log; | ||||||
|  |     msdoc_text_ctx.logf = noop_logf; | ||||||
|  |     msdoc_text_ctx.store = counter_store; | ||||||
|  |     msdoc_text_ctx.content_size = 500; | ||||||
|  |     msdoc_text_ctx.tn_size = 0; | ||||||
|  | 
 | ||||||
|     av_log_set_level(AV_LOG_QUIET); |     av_log_set_level(AV_LOG_QUIET); | ||||||
|     ::testing::InitGoogleTest(&argc, argv); |     ::testing::InitGoogleTest(&argc, argv); | ||||||
|     return RUN_ALL_TESTS(); |     return RUN_ALL_TESTS(); | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
										Submodule
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										1
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
										Submodule
									
								
							| @ -0,0 +1 @@ | |||||||
|  | Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030 | ||||||
							
								
								
									
										2
									
								
								third-party/utf8.h
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								third-party/utf8.h
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | |||||||
| Subproject commit fdcacc00ff48f7d268108dfb0ec7ebc485f1eb16 | Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50 | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user