Revised patch. Changes relative to the submitted version:
  * libscan/wpd/wpd.c: parse_wpd() checks the read_all() result and ends with
    an explicit `return SCAN_OK;` instead of falling off the end.
  * libscan/wpd/libwpd_c_api.cpp: duplicated "append a space" bodies share one
    helper; C-style casts replaced with static_cast.
  * Comments added to the new files; their hunk line counts were updated.
Reviewer note: this patch was rebuilt from a copy whose newlines had been
flattened. Indentation and blank context lines are reconstructed, the
<libavutil/avutil.h> context line in test/main.cpp is a best guess, and the
blob hashes on the "index" lines of the new files predate these edits.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b707ce6..52f05b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(
         libscan/media/media.c libscan/media/media.h
         libscan/font/font.c libscan/font/font.h
         libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
+        libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
         third-party/utf8.h
         libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h
         libscan/raw/raw.c libscan/raw/raw.h)
@@ -117,31 +118,31 @@ ExternalProject_Add(
 SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
 SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
 
-#ExternalProject_Add(
-#        libwpd
-#        URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
-#
-#        UPDATE_COMMAND ""
-#        PATCH_COMMAND ""
-#        TEST_COMMAND ""
-#        CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
-#        INSTALL_COMMAND ""
-#
-#        PREFIX "third-party/ext_libwpd"
-#        SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
-#        BINARY_DIR "third-party/ext_libwpd/src/libwpd"
-#
-#        BUILD_COMMAND ${MAKE_EXE} -j33
-#)
-#SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
-#SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
+ExternalProject_Add(
+        libwpd
+        URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
+
+        UPDATE_COMMAND ""
+        PATCH_COMMAND ""
+        TEST_COMMAND ""
+        CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
+        INSTALL_COMMAND ""
+
+        PREFIX "third-party/ext_libwpd"
+        SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
+        BINARY_DIR "third-party/ext_libwpd/src/libwpd"
+
+        BUILD_COMMAND ${MAKE_EXE} -j33
+)
+SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
+SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
 
 add_dependencies(
         scan
         libmobi
         ffmpeg
         antiword
-#        libwpd
+        libwpd
 )
 
 target_link_libraries(
@@ -160,7 +161,8 @@ target_link_libraries(
 
         ${MOBI_LIB_DIR}/libmobi.a
 
-#        ${WPD_LIB_DIR}/libwpd-0.9.a
+        ${WPD_LIB_DIR}/libwpd-0.9.a
+        ${WPD_LIB_DIR}/libwpd-stream-0.9.a
 
         ${FREETYPE_LIB}
         ${HARFBUZZ_LIB}
@@ -205,7 +207,7 @@ target_include_directories(
         ${LIBXML2_INCLUDE_DIR}
         ${FFMPEG_INCLUDE_DIR}
         ${MOBI_INCLUDE_DIR}
-#        ${WPD_INCLUDE_DIR}
+        ${WPD_INCLUDE_DIR}
 )
 
 if (BUILD_TESTS)
diff --git a/libscan/wpd/libwpd_c_api.cpp b/libscan/wpd/libwpd_c_api.cpp
new file mode 100644
index 0000000..c0f5c82
--- /dev/null
+++ b/libscan/wpd/libwpd_c_api.cpp
@@ -0,0 +1,129 @@
+#include "libwpd_c_api.h"
+#include "libwpd/libwpd.h"
+#include "libwpd/WPXProperty.h"
+#include "libwpd-stream/libwpd-stream.h"
+
+/**
+ * Document sink handed to WPDocument::parse(). Text runs are appended to the
+ * text_buffer_t; paragraph/span/section ends, tabs, spaces and line breaks
+ * each append a single space. Every other callback is a no-op.
+ */
+class StringDocument : public WPXDocumentInterface {
+
+private:
+    text_buffer_t *tex;
+    document_t *doc;
+    // Latched once an append returns TEXT_BUF_FULL; later appends are skipped.
+    bool is_full;
+
+    /** Append one space unless the buffer has already reported being full. */
+    void append_separator() {
+        if (!this->is_full && text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
+            this->is_full = true;
+        }
+    }
+
+public:
+
+    StringDocument(text_buffer_t *tex, document_t *doc) : tex(tex), doc(doc), is_full(false) {}
+
+    void setDocumentMetaData(const WPXPropertyList &propList) override {
+        WPXPropertyList::Iter propIter(propList);
+        for (propIter.rewind(); propIter.next();) {
+            // TODO: Read metadata here ?!
+        }
+    }
+
+    void endDocument() override {
+        text_buffer_terminate_string(this->tex);
+    }
+
+    void insertText(const WPXString &text) override {
+        if (!this->is_full && text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
+            this->is_full = true;
+        }
+    }
+
+    // Structural boundaries and whitespace all collapse to one space.
+    void closeParagraph() override { append_separator(); }
+    void closeSpan() override { append_separator(); }
+    void closeSection() override { append_separator(); }
+    void insertTab() override { append_separator(); }
+    void insertSpace() override { append_separator(); }
+    void insertLineBreak() override { append_separator(); }
+
+    // The remaining callbacks are intentionally ignored.
+    void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
+    void closePageSpan() override { /* noop */ }
+    void openHeader(const WPXPropertyList &propList) override { /* noop */ }
+    void closeHeader() override { /* noop */ }
+    void openFooter(const WPXPropertyList &propList) override { /* noop */ }
+    void closeFooter() override { /* noop */ }
+    void defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
+    void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
+    void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
+    void openSpan(const WPXPropertyList &propList) override { /* noop */ }
+    void defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
+    void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
+    void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
+    void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
+    void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
+    void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
+    void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
+    void closeOrderedListLevel() override { /* noop */ }
+    void closeUnorderedListLevel() override { /* noop */ }
+    void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
+    void closeListElement() override { /* noop */ }
+    void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
+    void closeFootnote() override { /* noop */ }
+    void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
+    void closeEndnote() override { /* noop */ }
+    void openComment(const WPXPropertyList &propList) override { /* noop */ }
+    void closeComment() override { /* noop */ }
+    void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
+    void closeTextBox() override { /* noop */ }
+    void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
+    void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
+    void closeTableRow() override { /* noop */ }
+    void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
+    void closeTableCell() override { /* noop */ }
+    void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
+    void closeTable() override { /* noop */ }
+    void openFrame(const WPXPropertyList &propList) override { /* noop */ }
+    void closeFrame() override { /* noop */ }
+    void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
+    void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
+    void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
+    void startDocument() override { /* noop */ }
+};
+
+
+/** Create a WPXStringStream over buf; release it with wpd_memory_stream_destroy(). */
+wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
+    return new WPXStringStream(buf, buf_len);
+}
+
+/** Forward to WPDocument::isFileFormatSupported() and return its verdict as a C enum. */
+wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
+    auto *stream = static_cast<WPXStringStream *>(ptr);
+    WPDConfidence confidence = WPDocument::isFileFormatSupported(stream);
+
+    // NOTE(review): relies on wpd_confidence_t mirroring WPDConfidence value-for-value - confirm.
+    return static_cast<wpd_confidence_t>(confidence);
+}
+
+/** Run WPDocument::parse() on the stream, collecting the document text into tex. */
+wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
+    auto *stream = static_cast<WPXStringStream *>(ptr);
+
+    StringDocument myDoc(tex, doc);
+    WPDResult result = WPDocument::parse(stream, &myDoc, nullptr);
+
+    // NOTE(review): relies on wpd_result_t mirroring WPDResult value-for-value - confirm.
+    return static_cast<wpd_result_t>(result);
+}
+
+/** Delete a stream created by wpd_memory_stream_create(). */
+void wpd_memory_stream_destroy(wpd_stream_t ptr) {
+    delete static_cast<WPXStringStream *>(ptr);
+}
diff --git a/libscan/wpd/libwpd_c_api.h b/libscan/wpd/libwpd_c_api.h
new file mode 100644
index 0000000..822b12e
--- /dev/null
+++ b/libscan/wpd/libwpd_c_api.h
@@ -0,0 +1,53 @@
+#ifndef SIST2_LIBWPD_C_API_H
+#define SIST2_LIBWPD_C_API_H
+
+#include "stdlib.h"
+
+#ifdef __cplusplus
+#define EXTERNC extern "C"
+#else
+#define EXTERNC
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "../scan.h"
+#include "../util.h"
+#ifdef __cplusplus
+}
+#endif
+
+
+/* Opaque handle; libwpd_c_api.cpp treats it as a WPXStringStream pointer. */
+typedef void *wpd_stream_t;
+
+/* Returned by wpd_is_file_format_supported(). */
+typedef enum {
+    C_WPD_CONFIDENCE_NONE = 0,
+    C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
+    C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
+    C_WPD_CONFIDENCE_EXCELLENT
+} wpd_confidence_t;
+
+/* Returned by wpd_parse(). */
+typedef enum {
+    C_WPD_OK,
+    C_WPD_FILE_ACCESS_ERROR,
+    C_WPD_PARSE_ERROR,
+    C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
+    C_WPD_PASSWORD_MISSMATCH_ERROR,
+    C_WPD_OLE_ERROR,
+    C_WPD_UNKNOWN_ERROR
+} wpd_result_t;
+
+
+EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
+
+EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
+
+EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
+
+EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
+
+#endif
diff --git a/libscan/wpd/wpd.c b/libscan/wpd/wpd.c
new file mode 100644
index 0000000..d1c13b6
--- /dev/null
+++ b/libscan/wpd/wpd.c
@@ -0,0 +1,58 @@
+#include "wpd.h"
+#include "libwpd_c_api.h"
+
+/**
+ * Read the whole file into memory, let libwpd extract its text and attach it
+ * to doc as MetaContent.
+ *
+ * Returns SCAN_ERR_READ when the file cannot be read, is encrypted or is not
+ * recognised with full confidence; SCAN_OK otherwise. A libwpd parse error is
+ * only logged: whatever text was collected before it is still attached.
+ */
+scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {
+
+    size_t buf_len;
+    void *buf = read_all(f, &buf_len);
+    // NOTE(review): assumes read_all() reports failure by returning NULL - confirm in util.h.
+    if (buf == NULL) {
+        CTX_LOG_ERRORF("wpd.c", "read_all() failed [%s]", doc->filepath)
+        return SCAN_ERR_READ;
+    }
+
+    void *stream = wpd_memory_stream_create(buf, buf_len);
+    wpd_confidence_t conf = wpd_is_file_format_supported(stream);
+
+    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
+        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
+        wpd_memory_stream_destroy(stream);
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+
+    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
+        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
+        wpd_memory_stream_destroy(stream);
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+
+    text_buffer_t tex = text_buffer_create(-1);
+    wpd_result_t res = wpd_parse(stream, &tex, doc);
+
+    if (res != C_WPD_OK) {
+        CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
+                       doc->filepath, res)
+    }
+
+    if (tex.dyn_buffer.cur != 0) {
+        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
+    }
+
+    text_buffer_destroy(&tex);
+    wpd_memory_stream_destroy(stream);
+    free(buf);
+
+    // Control used to fall off the end of this non-void function here.
+    // NOTE(review): SCAN_OK is assumed to be declared in scan.h next to SCAN_ERR_READ - confirm.
+    return SCAN_OK;
+}
diff --git a/libscan/wpd/wpd.h b/libscan/wpd/wpd.h
new file mode 100644
index 0000000..3639e13
--- /dev/null
+++ b/libscan/wpd/wpd.h
@@ -0,0 +1,26 @@
+#ifndef SIST2_WPD_H
+#define SIST2_WPD_H
+
+#include "../scan.h"
+#include "../util.h"
+
+/* Context passed to the WPD scanner entry points below. */
+typedef struct {
+    long content_size;
+
+    log_callback_t log;
+    logf_callback_t logf;
+
+    unsigned int wpd_mime;
+} scan_wpd_ctx_t;
+
+/* Extract the text of the WordPerfect file f into doc. */
+scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
+
+/* Non-zero when mime equals the context's wpd_mime. */
+__always_inline
+static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
+    return mime == ctx->wpd_mime;
+}
+
+#endif
diff --git a/test/main.cpp b/test/main.cpp
index 8e3b49d..19d59b7 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -11,6 +11,7 @@ extern "C" {
 #include "../libscan/mobi/scan_mobi.h"
 #include "../libscan/raw/raw.h"
 #include "../libscan/msdoc/msdoc.h"
+#include "../libscan/wpd/wpd.h"
 #include <libavutil/avutil.h>
 }
 
@@ -39,6 +40,8 @@ static scan_msdoc_ctx_t msdoc_ctx;
 static scan_msdoc_ctx_t msdoc_text_ctx;
 
 
+static scan_wpd_ctx_t wpd_ctx;
+
 document_t LastSubDoc;
 
 
@@ -942,7 +945,7 @@ TEST(Msdoc, TestFuzz1) {
 
     for (int i = 0; i < 1000; i++) {
         size_t buf_len_copy = buf_len;
-        char *buf_copy = (char*)malloc(buf_len);
+        char *buf_copy = (char *) malloc(buf_len);
         memcpy(buf_copy, buf, buf_len);
         fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);
 
@@ -953,6 +956,18 @@ TEST(Msdoc, TestFuzz1) {
 
     cleanup(&doc, &f);
 }
+TEST(Wpd, Wpd51_1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/wpd/test51_1.wpd", &f, &doc);
+
+    parse_wpd(&wpd_ctx, &f, &doc);
+
+    ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val,
+                 "Hello, WordPerfect This is a test This is the next page This is another page");
+
+    cleanup(&doc, &f);
+}
 
 int main(int argc, char **argv) {
     setlocale(LC_ALL, "");
@@ -1034,6 +1049,10 @@ int main(int argc, char **argv) {
     msdoc_text_ctx.content_size = 500;
     msdoc_text_ctx.tn_size = 0;
 
+    wpd_ctx.log = noop_log;
+    wpd_ctx.logf = noop_logf;
+    wpd_ctx.content_size = 500;
+
     av_log_set_level(AV_LOG_QUIET);
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();