mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 04:22:58 +00:00
Add .doc support
This commit is contained in:
parent
8643f5fb65
commit
070186fea0
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +1,6 @@
|
|||||||
[submodule "third-party/utf8.h"]
|
[submodule "third-party/utf8.h"]
|
||||||
path = third-party/utf8.h
|
path = third-party/utf8.h
|
||||||
url = https://github.com/sheredom/utf8.h
|
url = https://github.com/sheredom/utf8.h
|
||||||
|
[submodule "third-party/antiword"]
|
||||||
|
path = third-party/antiword
|
||||||
|
url = https://github.com/simon987/antiword
|
||||||
|
@ -5,6 +5,12 @@ set(CMAKE_C_STANDARD 11)
|
|||||||
|
|
||||||
option(BUILD_TESTS "Build tests" off)
|
option(BUILD_TESTS "Build tests" off)
|
||||||
|
|
||||||
|
add_subdirectory(third-party/antiword)
|
||||||
|
add_compile_definitions(
|
||||||
|
antiword
|
||||||
|
NDEBUG
|
||||||
|
)
|
||||||
|
|
||||||
add_library(
|
add_library(
|
||||||
scan
|
scan
|
||||||
libscan/util.c libscan/util.h
|
libscan/util.c libscan/util.h
|
||||||
@ -18,6 +24,7 @@ add_library(
|
|||||||
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
|
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
|
||||||
libscan/media/media.c libscan/media/media.h
|
libscan/media/media.c libscan/media/media.h
|
||||||
libscan/font/font.c libscan/font/font.h
|
libscan/font/font.c libscan/font/font.h
|
||||||
|
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
|
||||||
|
|
||||||
third-party/utf8.h
|
third-party/utf8.h
|
||||||
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
|
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
|
||||||
@ -110,6 +117,7 @@ add_dependencies(
|
|||||||
scan
|
scan
|
||||||
libmobi
|
libmobi
|
||||||
ffmpeg
|
ffmpeg
|
||||||
|
antiword
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(
|
target_link_libraries(
|
||||||
@ -161,6 +169,7 @@ target_link_libraries(
|
|||||||
${JAS_LIB}
|
${JAS_LIB}
|
||||||
${GUMBO_LIB}
|
${GUMBO_LIB}
|
||||||
dl
|
dl
|
||||||
|
antiword
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(
|
target_include_directories(
|
||||||
@ -183,4 +192,8 @@ if (BUILD_TESTS)
|
|||||||
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
|
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
|
||||||
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
|
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
|
||||||
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
|
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
|
||||||
|
|
||||||
|
add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
|
||||||
|
target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
|
||||||
|
target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
|
||||||
endif()
|
endif()
|
||||||
|
@ -292,10 +292,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
|
|||||||
;
|
;
|
||||||
|
|
||||||
if (strlen(title) > 0) {
|
if (strlen(title) > 0) {
|
||||||
meta_line_t *meta_title = malloc(sizeof(meta_line_t) + strlen(title));
|
APPEND_UTF8_META(doc, MetaTitle, title)
|
||||||
meta_title->key = MetaTitle;
|
|
||||||
strcpy(meta_title->str_val, title);
|
|
||||||
APPEND_META(doc, meta_title)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char author[4096] = {'\0',};
|
char author[4096] = {'\0',};
|
||||||
@ -305,10 +302,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
|
|||||||
;
|
;
|
||||||
|
|
||||||
if (strlen(author) > 0) {
|
if (strlen(author) > 0) {
|
||||||
meta_line_t *meta_author = malloc(sizeof(meta_line_t) + strlen(author));
|
APPEND_UTF8_META(doc, MetaAuthor, author)
|
||||||
meta_author->key = MetaAuthor;
|
|
||||||
strcpy(meta_author->str_val, author);
|
|
||||||
APPEND_META(doc, meta_author)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int page_count = -1;
|
int page_count = -1;
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
#ifndef FALSE
|
#ifndef FALSE
|
||||||
#define FALSE (0)
|
#define FALSE (0)
|
||||||
#define BOOL int
|
#define BOOL int
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef TRUE
|
#ifndef TRUE
|
||||||
#define TRUE (!FALSE)
|
#define TRUE (!FALSE)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#undef MAX
|
#undef MAX
|
||||||
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
|
||||||
|
|
||||||
#undef MIN
|
#undef MIN
|
||||||
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
||||||
|
|
||||||
#ifndef PATH_MAX
|
#ifndef PATH_MAX
|
||||||
@ -18,7 +18,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#undef ABS
|
#undef ABS
|
||||||
#define ABS(a) (((a) < 0) ? -(a) : (a))
|
#define ABS(a) (((a) < 0) ? -(a) : (a))
|
||||||
|
|
||||||
#define APPEND_STR_META(doc, keyname, value) \
|
#define APPEND_STR_META(doc, keyname, value) \
|
||||||
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
|
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
|
||||||
@ -37,3 +37,23 @@
|
|||||||
meta_str->key = MetaThumbnail; \
|
meta_str->key = MetaThumbnail; \
|
||||||
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
|
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
|
||||||
APPEND_META(doc, meta_str)}
|
APPEND_META(doc, meta_str)}
|
||||||
|
|
||||||
|
/*
 * Append the meta line `meta` to the end of the singly linked metadata list
 * kept in `doc` (fields meta_head / meta_tail).
 *
 * Both arguments are pointer expressions and are evaluated more than once, so
 * they must not have side effects. The expansion is a single brace-enclosed
 * block: the macro therefore behaves as one statement (e.g. as the body of an
 * unbraced `if`) and, like the other APPEND_* macros, is invoked without a
 * trailing semicolon.
 */
#define APPEND_META(doc, meta) \
    { \
        (meta)->next = NULL; \
        if ((doc)->meta_head == NULL) { \
            /* Empty list: the new element is both head and tail */ \
            (doc)->meta_head = (meta); \
            (doc)->meta_tail = (doc)->meta_head; \
        } else { \
            (doc)->meta_tail->next = (meta); \
            (doc)->meta_tail = (meta); \
        } \
    }
|
||||||
|
|
||||||
|
/*
 * Allocate a meta line with key `keyname` whose string value is `str` after it
 * has been passed through a text_buffer_t, then append it to `doc`.
 *
 * NOTE(review): the text buffer presumably normalises `str` to valid UTF-8;
 * confirm against text_buffer_append_string().
 *
 * The expansion is enclosed in braces so that the temporaries `tex` and
 * `meta_tag` stay local to the macro: it can be used several times in the same
 * scope and acts as a single statement. `str` must be a NUL-terminated string
 * and must not be an expression named `tex` or `meta_tag`.
 */
#define APPEND_UTF8_META(doc, keyname, str) \
    { \
        text_buffer_t tex = text_buffer_create(-1); \
        text_buffer_append_string0(&tex, str); \
        text_buffer_terminate_string(&tex); \
        meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
        meta_tag->key = keyname; \
        strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
        APPEND_META(doc, meta_tag) \
        text_buffer_destroy(&tex); \
    }
|
||||||
|
@ -166,15 +166,8 @@ void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDic
|
|||||||
text_buffer_destroy(&tex);
|
text_buffer_destroy(&tex);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define APPEND_TAG_META(doc, tag_, keyname) \
|
#define APPEND_TAG_META(keyname) \
|
||||||
text_buffer_t tex = text_buffer_create(-1); \
|
APPEND_UTF8_META(doc, keyname, tag->value)
|
||||||
text_buffer_append_string0(&tex, tag_->value); \
|
|
||||||
text_buffer_terminate_string(&tex); \
|
|
||||||
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
|
|
||||||
meta_tag->key = keyname; \
|
|
||||||
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
|
|
||||||
APPEND_META(doc, meta_tag) \
|
|
||||||
text_buffer_destroy(&tex);
|
|
||||||
|
|
||||||
#define STRCPY_TOLOWER(dst, str) \
|
#define STRCPY_TOLOWER(dst, str) \
|
||||||
strncpy(dst, str, sizeof(dst)); \
|
strncpy(dst, str, sizeof(dst)); \
|
||||||
@ -190,17 +183,17 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
|
|||||||
STRCPY_TOLOWER(key, tag->key)
|
STRCPY_TOLOWER(key, tag->key)
|
||||||
|
|
||||||
if (strcmp(key, "artist") == 0) {
|
if (strcmp(key, "artist") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaArtist)
|
APPEND_TAG_META(MetaArtist)
|
||||||
} else if (strcmp(key, "genre") == 0) {
|
} else if (strcmp(key, "genre") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaGenre)
|
APPEND_TAG_META(MetaGenre)
|
||||||
} else if (strcmp(key, "title") == 0) {
|
} else if (strcmp(key, "title") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaTitle)
|
APPEND_TAG_META(MetaTitle)
|
||||||
} else if (strcmp(key, "album_artist") == 0) {
|
} else if (strcmp(key, "album_artist") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaAlbumArtist)
|
APPEND_TAG_META(MetaAlbumArtist)
|
||||||
} else if (strcmp(key, "album") == 0) {
|
} else if (strcmp(key, "album") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaAlbum)
|
APPEND_TAG_META(MetaAlbum)
|
||||||
} else if (strcmp(key, "comment") == 0) {
|
} else if (strcmp(key, "comment") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaContent)
|
APPEND_TAG_META(MetaContent)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -244,25 +237,25 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
|
|||||||
if (strcmp(key, "artist") == 0) {
|
if (strcmp(key, "artist") == 0) {
|
||||||
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
|
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
|
||||||
} else if (strcmp(tag->key, "ImageDescription") == 0) {
|
} else if (strcmp(tag->key, "ImageDescription") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaContent)
|
APPEND_TAG_META(MetaContent)
|
||||||
} else if (strcmp(tag->key, "Make") == 0) {
|
} else if (strcmp(tag->key, "Make") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifMake)
|
APPEND_TAG_META(MetaExifMake)
|
||||||
} else if (strcmp(tag->key, "Model") == 0) {
|
} else if (strcmp(tag->key, "Model") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifModel)
|
APPEND_TAG_META(MetaExifModel)
|
||||||
} else if (strcmp(tag->key, "Software") == 0) {
|
} else if (strcmp(tag->key, "Software") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifSoftware)
|
APPEND_TAG_META(MetaExifSoftware)
|
||||||
} else if (strcmp(tag->key, "FNumber") == 0) {
|
} else if (strcmp(tag->key, "FNumber") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifFNumber)
|
APPEND_TAG_META(MetaExifFNumber)
|
||||||
} else if (strcmp(tag->key, "FocalLength") == 0) {
|
} else if (strcmp(tag->key, "FocalLength") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifFocalLength)
|
APPEND_TAG_META(MetaExifFocalLength)
|
||||||
} else if (strcmp(tag->key, "UserComment") == 0) {
|
} else if (strcmp(tag->key, "UserComment") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifUserComment)
|
APPEND_TAG_META(MetaExifUserComment)
|
||||||
} else if (strcmp(tag->key, "ISOSpeedRatings") == 0) {
|
} else if (strcmp(tag->key, "ISOSpeedRatings") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings)
|
APPEND_TAG_META(MetaExifIsoSpeedRatings)
|
||||||
} else if (strcmp(tag->key, "ExposureTime") == 0) {
|
} else if (strcmp(tag->key, "ExposureTime") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifExposureTime)
|
APPEND_TAG_META(MetaExifExposureTime)
|
||||||
} else if (strcmp(tag->key, "DateTime") == 0) {
|
} else if (strcmp(tag->key, "DateTime") == 0) {
|
||||||
APPEND_TAG_META(doc, tag, MetaExifDateTime)
|
APPEND_TAG_META(MetaExifDateTime)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
165
libscan/msdoc/msdoc.c
Normal file
165
libscan/msdoc/msdoc.c
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
#include "msdoc.h"
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include "../../third-party/antiword/src/antiword.h"
|
||||||
|
|
||||||
|
#include "../ebook/ebook.h"
|
||||||
|
|
||||||
|
/**
 * Extract author, title and plain-text content from a legacy MS Word (.doc)
 * file using antiword, and append them to `doc` as meta lines.
 *
 * The file is read fully into memory, wrapped in a FILE* with fmemopen() and
 * converted by antiword into an in-memory stream (open_memstream()).
 *
 * @param ctx scan context; content_size is passed to text_buffer_create() for
 *            the MetaContent value, and the CTX_LOG_* macros log through it
 * @param f   file to read
 * @param doc document receiving the MetaAuthor / MetaTitle / MetaContent lines
 *
 * On any failure the function returns without adding further meta lines; all
 * buffers and streams opened so far are released.
 */
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    // Load the whole file and expose it to antiword as a read-only stream
    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file_in = fmemopen(buf, buf_len, "rb");
    if (file_in == NULL) {
        // Log first: free() is allowed to modify errno
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        free(buf);
        return;
    }

    // Configure antiword for UTF-8 plain-text output
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_text;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_utf_8;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    // Negative and version 3 results are rejected
    int doc_word_version = iGuessVersionNumber(file_in, buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        fclose(file_in);
        free(buf);
        return;
    }
    rewind(file_in);

    size_t out_len = 0;
    char *out_buf = NULL;

    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERRORF(f->filepath, "open_memstream() failed (%d)", errno)
        fclose(file_in);
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release everything acquired so far, not only the input stream
        fclose(file_out);
        free(out_buf);
        fclose(file_in);
        free(buf);
        return;
    }

    // Document properties
    iInitDocument(file_in, buf_len);
    const char *author = szGetAuthor();
    if (author != NULL) {
        APPEND_UTF8_META(doc, MetaAuthor, author)
    }

    const char *title = szGetTitle();
    if (title != NULL) {
        APPEND_UTF8_META(doc, MetaTitle, title)
    }
    vFreeDocument();

    // Convert the body; closing the memstream finalises out_buf / out_len
    bWordDecryptor(file_in, buf_len, diag);
    vDestroyDiagram(diag);
    fclose(file_out);

    // Guard on the converted output: buf_len is the size of the input and
    // says nothing about whether any text was produced
    if (out_len > 0) {
        text_buffer_t tex = text_buffer_create(ctx->content_size);
        text_buffer_append_string(&tex, out_buf, out_len);
        text_buffer_terminate_string(&tex);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        meta_content->key = MetaContent;
        memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
        APPEND_META(doc, meta_content)

        text_buffer_destroy(&tex);
    }

    fclose(file_in);
    free(buf);
    free(out_buf);
}
|
||||||
|
|
||||||
|
/**
 * Convert a legacy MS Word (.doc) file to PDF in memory with antiword and hand
 * the result to parse_ebook_mem() with the "application/pdf" mime type.
 *
 * @param ctx scan context; content_size, tn_size and the log/logf/store
 *            callbacks are copied into the ebook context used for the PDF
 * @param f   file to read
 * @param doc document passed through to parse_ebook_mem()
 *
 * On any failure the function returns without calling parse_ebook_mem(); all
 * buffers and streams opened so far are released.
 */
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    scan_ebook_ctx_t ebook_ctx = {
            .content_size = ctx->content_size,
            .tn_size = ctx->tn_size,
            .log = ctx->log,
            .logf = ctx->logf,
            .store = ctx->store,
    };

    // Load the whole file and expose it to antiword as a read-only stream
    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        // Log first: free() is allowed to modify errno
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        free(buf);
        return;
    }

    // Configure antiword for PDF output
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_pdf;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_latin_2; // NOTE(review): text path uses UTF-8; confirm this is intended for PDF
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    // Negative and version 3 results are rejected
    int doc_word_version = iGuessVersionNumber(file, buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        fclose(file);
        free(buf);
        return;
    }
    rewind(file);

    size_t out_len = 0;
    char *out_buf = NULL;

    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERRORF(f->filepath, "open_memstream() failed (%d)", errno)
        fclose(file);
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release everything acquired so far, not only the input stream
        fclose(file_out);
        free(out_buf);
        fclose(file);
        free(buf);
        return;
    }

    // Convert the document; closing the memstream finalises out_buf / out_len
    bWordDecryptor(file, buf_len, diag);
    vDestroyDiagram(diag);
    fclose(file_out);

    parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc);

    fclose(file);
    free(buf);
    free(out_buf);
}
|
||||||
|
|
||||||
|
/**
 * Entry point for .doc files: when ctx->tn_size is positive the document is
 * processed through the PDF path (parse_msdoc_pdf), otherwise through the
 * plain-text path (parse_msdoc_text).
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (ctx->tn_size <= 0) {
        parse_msdoc_text(ctx, f, doc);
        return;
    }

    parse_msdoc_pdf(ctx, f, doc);
}
|
22
libscan/msdoc/msdoc.h
Normal file
22
libscan/msdoc/msdoc.h
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
#ifndef SCAN_SCAN_MSDOC_H
|
||||||
|
#define SCAN_SCAN_MSDOC_H
|
||||||
|
|
||||||
|
#include "../scan.h"
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
long content_size;
|
||||||
|
int tn_size;
|
||||||
|
log_callback_t log;
|
||||||
|
logf_callback_t logf;
|
||||||
|
store_callback_t store;
|
||||||
|
unsigned int msdoc_mime;
|
||||||
|
} scan_msdoc_ctx_t;
|
||||||
|
|
||||||
|
__always_inline
|
||||||
|
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
|
||||||
|
return mime == ctx->msdoc_mime;
|
||||||
|
}
|
||||||
|
|
||||||
|
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||||
|
|
||||||
|
#endif
|
@ -1,6 +1,8 @@
|
|||||||
#ifndef SCAN_SCAN_H
|
#ifndef SCAN_SCAN_H
|
||||||
#define SCAN_SCAN_H
|
#define SCAN_SCAN_H
|
||||||
|
|
||||||
|
#define _GNU_SOURCE
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <uuid/uuid.h>
|
#include <uuid/uuid.h>
|
||||||
@ -147,16 +149,6 @@ typedef struct parse_job_t {
|
|||||||
} parse_job_t;
|
} parse_job_t;
|
||||||
|
|
||||||
|
|
||||||
#define APPEND_META(doc, meta) \
|
|
||||||
meta->next = NULL;\
|
|
||||||
if (doc->meta_head == NULL) {\
|
|
||||||
doc->meta_head = meta;\
|
|
||||||
doc->meta_tail = doc->meta_head;\
|
|
||||||
} else {\
|
|
||||||
doc->meta_tail->next = meta;\
|
|
||||||
doc->meta_tail = meta;\
|
|
||||||
}
|
|
||||||
|
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
typedef void (*parse_callback_t)(parse_job_t *job);
|
typedef void (*parse_callback_t)(parse_job_t *job);
|
||||||
|
@ -273,7 +273,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int text_buffer_append_string0(text_buffer_t *buf, char *str) {
|
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
|
||||||
return text_buffer_append_string(buf, str, strlen(str));
|
return text_buffer_append_string(buf, str, strlen(str));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
109
test/main.cpp
109
test/main.cpp
@ -10,6 +10,7 @@ extern "C" {
|
|||||||
#include "../libscan/ooxml/ooxml.h"
|
#include "../libscan/ooxml/ooxml.h"
|
||||||
#include "../libscan/mobi/scan_mobi.h"
|
#include "../libscan/mobi/scan_mobi.h"
|
||||||
#include "../libscan/raw/raw.h"
|
#include "../libscan/raw/raw.h"
|
||||||
|
#include "../libscan/msdoc/msdoc.h"
|
||||||
#include <libavutil/avutil.h>
|
#include <libavutil/avutil.h>
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -33,6 +34,10 @@ static scan_mobi_ctx_t mobi_500_ctx;
|
|||||||
|
|
||||||
static scan_raw_ctx_t raw_ctx;
|
static scan_raw_ctx_t raw_ctx;
|
||||||
|
|
||||||
|
static scan_msdoc_ctx_t msdoc_ctx;
|
||||||
|
|
||||||
|
static scan_msdoc_ctx_t msdoc_text_ctx;
|
||||||
|
|
||||||
|
|
||||||
document_t LastSubDoc;
|
document_t LastSubDoc;
|
||||||
|
|
||||||
@ -689,6 +694,98 @@ TEST(RAW, Fuji) {
|
|||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* msdoc */
|
||||||
|
// test1.doc with msdoc_ctx: expects content, title, author, page count and a
// change in store_size.
TEST(Msdoc, Test1Pdf) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);

    const size_t store_size_before = store_size;

    parse_msdoc(&msdoc_ctx, &f, &doc);

    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_TRUE(strstr(content, "October 2000") != nullptr);
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
    ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 57);
    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
    ASSERT_NE(store_size_before, store_size);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
// test1.doc with msdoc_text_ctx (tn_size == 0, plain-text path): expects
// content, title and author, and store_size left unchanged.
TEST(Msdoc, Test1Text) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);

    size_t size_before = store_size;

    parse_msdoc(&msdoc_text_ctx, &f, &doc);

    ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr);
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
    // Compare against the context this test actually parsed with
    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_text_ctx.content_size, 4);
    ASSERT_EQ(size_before, store_size);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
// test2.doc with msdoc_ctx: expects content, title, author and a change in
// store_size.
TEST(Msdoc, Test2Pdf) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc);

    const size_t store_size_before = store_size;

    parse_msdoc(&msdoc_ctx, &f, &doc);

    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_TRUE(strstr(content, "GNU Free Documentation License") != nullptr);
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging Information Format");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender");
    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
    ASSERT_NE(store_size_before, store_size);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
// test3.doc with msdoc_ctx: expects content, title, author and a change in
// store_size.
TEST(Msdoc, Test3Pdf) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc);

    const size_t store_size_before = store_size;

    parse_msdoc(&msdoc_ctx, &f, &doc);

    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_TRUE(strstr(content, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr);
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet");
    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
    ASSERT_NE(store_size_before, store_size);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
// test4.doc with msdoc_ctx: expects content, title, author and a change in
// store_size.
TEST(Msdoc, Test4Pdf) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc);

    const size_t store_size_before = store_size;

    parse_msdoc(&msdoc_ctx, &f, &doc);

    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_TRUE(strstr(content, "SQL Server international data types") != nullptr);
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen");
    ASSERT_NEAR(strlen(content), msdoc_ctx.content_size, 4);
    ASSERT_NE(store_size_before, store_size);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
@ -753,6 +850,18 @@ int main(int argc, char **argv) {
|
|||||||
raw_ctx.tn_size = 500;
|
raw_ctx.tn_size = 500;
|
||||||
raw_ctx.tn_qscale = 5.0;
|
raw_ctx.tn_qscale = 5.0;
|
||||||
|
|
||||||
|
msdoc_ctx.log = noop_log;
|
||||||
|
msdoc_ctx.logf = noop_logf;
|
||||||
|
msdoc_ctx.store = counter_store;
|
||||||
|
msdoc_ctx.content_size = 500;
|
||||||
|
msdoc_ctx.tn_size = 500;
|
||||||
|
|
||||||
|
msdoc_text_ctx.log = noop_log;
|
||||||
|
msdoc_text_ctx.logf = noop_logf;
|
||||||
|
msdoc_text_ctx.store = counter_store;
|
||||||
|
msdoc_text_ctx.content_size = 500;
|
||||||
|
msdoc_text_ctx.tn_size = 0;
|
||||||
|
|
||||||
av_log_set_level(AV_LOG_QUIET);
|
av_log_set_level(AV_LOG_QUIET);
|
||||||
::testing::InitGoogleTest(&argc, argv);
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
return RUN_ALL_TESTS();
|
return RUN_ALL_TESTS();
|
||||||
|
1
third-party/antiword
vendored
Submodule
1
third-party/antiword
vendored
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit be5e260190d807fdfb9ed1d64cf62d6649de3030
|
2
third-party/utf8.h
vendored
2
third-party/utf8.h
vendored
@ -1 +1 @@
|
|||||||
Subproject commit fdcacc00ff48f7d268108dfb0ec7ebc485f1eb16
|
Subproject commit e9762540f33eed32d9a568e20ce4c4a836722a50
|
Loading…
x
Reference in New Issue
Block a user