Basic support for WordPerfect files

This commit is contained in:
simon987 2021-09-06 14:06:56 -04:00
parent 75ff57fd94
commit fe53e1a219
6 changed files with 357 additions and 22 deletions

View File

@ -25,6 +25,7 @@ add_library(
libscan/media/media.c libscan/media/media.h libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h) libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
@ -117,31 +118,31 @@ ExternalProject_Add(
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg) SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg) SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
#ExternalProject_Add( ExternalProject_Add(
# libwpd libwpd
# URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
#
# UPDATE_COMMAND "" UPDATE_COMMAND ""
# PATCH_COMMAND "" PATCH_COMMAND ""
# TEST_COMMAND "" TEST_COMMAND ""
# CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
# INSTALL_COMMAND "" INSTALL_COMMAND ""
#
# PREFIX "third-party/ext_libwpd" PREFIX "third-party/ext_libwpd"
# SOURCE_DIR "third-party/ext_libwpd/src/libwpd" SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
# BINARY_DIR "third-party/ext_libwpd/src/libwpd" BINARY_DIR "third-party/ext_libwpd/src/libwpd"
#
# BUILD_COMMAND ${MAKE_EXE} -j33 BUILD_COMMAND ${MAKE_EXE} -j33
#) )
#SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/) SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
#SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/) SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
add_dependencies( add_dependencies(
scan scan
libmobi libmobi
ffmpeg ffmpeg
antiword antiword
# libwpd libwpd
) )
target_link_libraries( target_link_libraries(
@ -160,7 +161,8 @@ target_link_libraries(
${MOBI_LIB_DIR}/libmobi.a ${MOBI_LIB_DIR}/libmobi.a
# ${WPD_LIB_DIR}/libwpd-0.9.a ${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB} ${FREETYPE_LIB}
${HARFBUZZ_LIB} ${HARFBUZZ_LIB}
@ -205,7 +207,7 @@ target_include_directories(
${LIBXML2_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR} ${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR} ${MOBI_INCLUDE_DIR}
# ${WPD_INCLUDE_DIR} ${WPD_INCLUDE_DIR}
) )
if (BUILD_TESTS) if (BUILD_TESTS)

View File

@ -0,0 +1,200 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
/**
 * libwpd document sink that flattens a WordPerfect document into plain text.
 *
 * Text runs are appended verbatim to the supplied text buffer; structural
 * boundaries (paragraph, span, section, tab, space, line break) are each
 * rendered as a single space. Every other callback is intentionally a no-op.
 *
 * Once the text buffer reports TEXT_BUF_FULL, all further appends are skipped.
 */
class StringDocument : public WPXDocumentInterface {

private:
    text_buffer_t *tex;
    // Stored but not read by any callback yet; see the TODO in setDocumentMetaData.
    document_t *doc;
    // Latched to true the first time an append reports TEXT_BUF_FULL.
    bool is_full;

    /** Append one separating space unless the buffer has already filled up. */
    void appendSeparator() {
        if (!this->is_full && text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
            this->is_full = true;
        }
    }

public:

    /**
     * @param tex Destination buffer for the extracted text (not owned).
     * @param doc Document the text belongs to (not owned, currently unused).
     */
    StringDocument(text_buffer_t *tex, document_t *doc)
            : tex(tex), doc(doc), is_full(false) {}

    void setDocumentMetaData(const WPXPropertyList &propList) override {
        // TODO: Read metadata here ?!
    }

    /** Finalize the accumulated text once libwpd signals the end of the document. */
    void endDocument() override {
        text_buffer_terminate_string(this->tex);
    }

    // Structural boundaries: each one becomes a single space in the output.
    void closeParagraph() override { appendSeparator(); }

    void closeSpan() override { appendSeparator(); }

    void closeSection() override { appendSeparator(); }

    void insertTab() override { appendSeparator(); }

    void insertSpace() override { appendSeparator(); }

    void insertLineBreak() override { appendSeparator(); }

    /** Append a run of document text, latching is_full if the buffer overflows. */
    void insertText(const WPXString &text) override {
        if (!this->is_full && text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
            this->is_full = true;
        }
    }

    // Everything below is required by WPXDocumentInterface but carries no text
    // that this sink extracts.
    void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }

    void closePageSpan() override { /* noop */ }

    void openHeader(const WPXPropertyList &propList) override { /* noop */ }

    void closeHeader() override { /* noop */ }

    void openFooter(const WPXPropertyList &propList) override { /* noop */ }

    void closeFooter() override { /* noop */ }

    void
    defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }

    void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }

    void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }

    void openSpan(const WPXPropertyList &propList) override { /* noop */ }

    void
    defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }

    void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }

    void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }

    void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }

    void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }

    void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }

    void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }

    void closeOrderedListLevel() override { /* noop */ }

    void closeUnorderedListLevel() override { /* noop */ }

    void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }

    void closeListElement() override { /* noop */ }

    void openFootnote(const WPXPropertyList &propList) override { /* noop */ }

    void closeFootnote() override { /* noop */ }

    void openEndnote(const WPXPropertyList &propList) override { /* noop */ }

    void closeEndnote() override { /* noop */ }

    void openComment(const WPXPropertyList &propList) override { /* noop */ }

    void closeComment() override { /* noop */ }

    void openTextBox(const WPXPropertyList &propList) override { /* noop */ }

    void closeTextBox() override { /* noop */ }

    void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }

    void openTableRow(const WPXPropertyList &propList) override { /* noop */ }

    void closeTableRow() override { /* noop */ }

    void openTableCell(const WPXPropertyList &propList) override { /* noop */ }

    void closeTableCell() override { /* noop */ }

    void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }

    void closeTable() override { /* noop */ }

    void openFrame(const WPXPropertyList &propList) override { /* noop */ }

    void closeFrame() override { /* noop */ }

    void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }

    void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }

    void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }

    void startDocument() override { /* noop */ }
};
/**
 * Wrap an in-memory byte range in a libwpd WPXStringStream.
 *
 * @param buf     First byte of the document data.
 * @param buf_len Number of bytes available at buf.
 * @return Opaque handle to the heap-allocated stream; release it with
 *         wpd_memory_stream_destroy().
 */
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
/**
 * Ask libwpd how confident it is that the stream holds a parseable document.
 *
 * @param ptr Handle obtained from wpd_memory_stream_create().
 * @return libwpd's WPDConfidence value converted to the C enum by a plain cast.
 */
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    return static_cast<wpd_confidence_t>(WPDocument::isFileFormatSupported(input));
}
/**
 * Run libwpd over the stream, collecting the document text into tex.
 *
 * @param ptr Handle obtained from wpd_memory_stream_create().
 * @param tex Text buffer that receives the extracted text.
 * @param doc Document being scanned; handed to the StringDocument sink.
 * @return libwpd's WPDResult value converted to the C enum by a plain cast.
 */
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    StringDocument sink(tex, doc);

    // No password is supplied (third argument).
    const WPDResult status = WPDocument::parse(static_cast<WPXStringStream *>(ptr), &sink, nullptr);

    return static_cast<wpd_result_t>(status);
}
/**
 * Release a stream created by wpd_memory_stream_create().
 *
 * @param ptr Handle to destroy; it must not be used afterwards.
 */
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete static_cast<WPXStringStream *>(ptr);
}

View File

@ -0,0 +1,50 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
/* Opaque handle to a libwpd input stream, so that C callers never see the C++ type. */
typedef void *wpd_stream_t;
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
/*
 * C-side copy of libwpd's parse result code.
 *
 * libwpd_c_api.cpp produces these values by casting libwpd's WPDResult
 * directly, so the enumerators presumably have to stay in the same order as
 * the libwpd enum -- NOTE(review): confirm against the libwpd headers before
 * reordering or inserting values.
 */
typedef enum {
/* The only value parse_wpd() treats as success; anything else is logged as an error. */
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
/* Report how confident libwpd is that `stream` holds a document it can parse. */
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
/*
 * Create a libwpd stream over the `buf_len` bytes at `buf`.
 * The returned handle must be released with wpd_memory_stream_destroy().
 * NOTE(review): whether libwpd copies `buf` is not visible here; parse_wpd()
 * keeps the buffer alive until after the stream is destroyed.
 */
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
/* Destroy a stream created by wpd_memory_stream_create(). */
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
/*
 * Parse the document in `ptr`, appending its text to `tex`.
 * `doc` is passed through to the parser sink, which does not read it yet.
 */
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

41
libscan/wpd/wpd.c Normal file
View File

@ -0,0 +1,41 @@
#include "wpd.h"
#include "libwpd_c_api.h"
/**
 * Extract the text of a WordPerfect file and store it on `doc` as MetaContent.
 *
 * The whole file is read into memory and handed to libwpd through the C
 * wrapper API. Files that libwpd does not rate as C_WPD_CONFIDENCE_EXCELLENT
 * (including encrypted ones) are rejected.
 *
 * @param ctx Scan context; used by the CTX_LOG_* macros.
 * @param f   Virtual file to read the document from.
 * @param doc Document that receives the extracted text.
 * @return SCAN_ERR_READ if the file could not be read or is not a supported
 *         WPD file, SCAN_OK otherwise. A parse error is logged, but any text
 *         collected before the error is still stored and SCAN_OK is returned.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    // NOTE(review): assumes read_all() reports failure by returning NULL -- confirm.
    if (buf == NULL) {
        CTX_LOG_ERRORF("wpd.c", "Could not read file [%s]", doc->filepath)
        return SCAN_ERR_READ;
    }

    void *stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
            // Encrypted files are expected in the wild, hence only a debug message.
            CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
        } else {
            CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
        }
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(-1);
    wpd_result_t res = wpd_parse(stream, &tex, doc);

    if (res != C_WPD_OK) {
        CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                       doc->filepath, res)
    }

    if (tex.dyn_buffer.cur != 0) {
        if (res != C_WPD_OK) {
            // The buffer is only terminated from the parser's endDocument()
            // callback, which an aborted parse may never reach; terminate it
            // here so the partial text is a valid C string.
            // NOTE(review): presumes terminating twice is harmless -- confirm.
            text_buffer_terminate_string(&tex);
        }
        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    }

    text_buffer_destroy(&tex);
    wpd_memory_stream_destroy(stream);
    free(buf);

    // The original fell off the end of this non-void function.
    return SCAN_OK;
}

23
libscan/wpd/wpd.h Normal file
View File

@ -0,0 +1,23 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
/* Per-scan configuration for the WordPerfect parser. */
typedef struct {
/* NOTE(review): not read by parse_wpd() as shown in this commit -- presumably a cap on extracted text size; confirm. */
long content_size;
/* Logging callbacks; presumably invoked through the CTX_LOG_* macros -- confirm. */
log_callback_t log;
logf_callback_t logf;
/* Mime identifier that is_wpd() compares against to recognise WPD files. */
unsigned int wpd_mime;
} scan_wpd_ctx_t;
/* Read the WordPerfect file `f` and attach its extracted text to `doc`; see wpd.c. */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Return non-zero when `mime` is the mime identifier configured in `ctx` for WPD files. */
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
    const unsigned int configured = ctx->wpd_mime;
    return configured == mime;
}
#endif

View File

@ -11,6 +11,7 @@ extern "C" {
#include "../libscan/mobi/scan_mobi.h" #include "../libscan/mobi/scan_mobi.h"
#include "../libscan/raw/raw.h" #include "../libscan/raw/raw.h"
#include "../libscan/msdoc/msdoc.h" #include "../libscan/msdoc/msdoc.h"
#include "../libscan/wpd/wpd.h"
#include <libavutil/avutil.h> #include <libavutil/avutil.h>
} }
@ -39,6 +40,8 @@ static scan_msdoc_ctx_t msdoc_ctx;
static scan_msdoc_ctx_t msdoc_text_ctx; static scan_msdoc_ctx_t msdoc_text_ctx;
static scan_wpd_ctx_t wpd_ctx;
document_t LastSubDoc; document_t LastSubDoc;
@ -942,7 +945,7 @@ TEST(Msdoc, TestFuzz1) {
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 1000; i++) {
size_t buf_len_copy = buf_len; size_t buf_len_copy = buf_len;
char *buf_copy = (char*)malloc(buf_len); char *buf_copy = (char *) malloc(buf_len);
memcpy(buf_copy, buf, buf_len); memcpy(buf_copy, buf, buf_len);
fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5); fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);
@ -953,6 +956,18 @@ TEST(Msdoc, TestFuzz1) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Parses the test51_1.wpd fixture and checks the flattened text content.
TEST(Wpd, Wpd51_1) {
    const char *expected_text =
            "Hello, WordPerfect This is a test This is the next page This is another page";

    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/wpd/test51_1.wpd", &file, &document);

    parse_wpd(&wpd_ctx, &file, &document);

    ASSERT_STREQ(get_meta(&document, MetaContent)->str_val, expected_text);

    cleanup(&document, &file);
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
setlocale(LC_ALL, ""); setlocale(LC_ALL, "");
@ -1034,6 +1049,10 @@ int main(int argc, char **argv) {
msdoc_text_ctx.content_size = 500; msdoc_text_ctx.content_size = 500;
msdoc_text_ctx.tn_size = 0; msdoc_text_ctx.tn_size = 0;
wpd_ctx.log = noop_log;
wpd_ctx.logf = noop_logf;
wpd_ctx.content_size = 500;
av_log_set_level(AV_LOG_QUIET); av_log_set_level(AV_LOG_QUIET);
::testing::InitGoogleTest(&argc, argv); ::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();