mirror of
https://github.com/simon987/libscan.git
synced 2025-04-21 19:26:43 +00:00
Compare commits
4 Commits
1dad8fae20
...
23da8ada5f
Author | SHA1 | Date | |
---|---|---|---|
23da8ada5f | |||
f061212d4b | |||
fe53e1a219 | |||
75ff57fd94 |
@ -25,6 +25,8 @@ add_library(
|
|||||||
libscan/media/media.c libscan/media/media.h
|
libscan/media/media.c libscan/media/media.h
|
||||||
libscan/font/font.c libscan/font/font.h
|
libscan/font/font.c libscan/font/font.h
|
||||||
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
|
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
|
||||||
|
libscan/json/json.c libscan/json/json.h
|
||||||
|
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
|
||||||
|
|
||||||
third-party/utf8.h
|
third-party/utf8.h
|
||||||
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
|
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
|
||||||
@ -32,6 +34,7 @@ set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
|
|||||||
|
|
||||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
|
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
|
||||||
|
|
||||||
|
find_package(cJSON CONFIG REQUIRED)
|
||||||
find_package(LibArchive REQUIRED)
|
find_package(LibArchive REQUIRED)
|
||||||
find_package(BZip2 REQUIRED)
|
find_package(BZip2 REQUIRED)
|
||||||
find_package(lz4 REQUIRED)
|
find_package(lz4 REQUIRED)
|
||||||
@ -117,37 +120,38 @@ ExternalProject_Add(
|
|||||||
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
||||||
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
||||||
|
|
||||||
#ExternalProject_Add(
|
ExternalProject_Add(
|
||||||
# libwpd
|
libwpd
|
||||||
# URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
|
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
|
||||||
#
|
|
||||||
# UPDATE_COMMAND ""
|
UPDATE_COMMAND ""
|
||||||
# PATCH_COMMAND ""
|
PATCH_COMMAND ""
|
||||||
# TEST_COMMAND ""
|
TEST_COMMAND ""
|
||||||
# CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
|
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
|
||||||
# INSTALL_COMMAND ""
|
INSTALL_COMMAND ""
|
||||||
#
|
|
||||||
# PREFIX "third-party/ext_libwpd"
|
PREFIX "third-party/ext_libwpd"
|
||||||
# SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
|
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
|
||||||
# BINARY_DIR "third-party/ext_libwpd/src/libwpd"
|
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
|
||||||
#
|
|
||||||
# BUILD_COMMAND ${MAKE_EXE} -j33
|
BUILD_COMMAND ${MAKE_EXE} -j33
|
||||||
#)
|
)
|
||||||
#SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
|
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
|
||||||
#SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
|
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
|
||||||
|
|
||||||
add_dependencies(
|
add_dependencies(
|
||||||
scan
|
scan
|
||||||
libmobi
|
libmobi
|
||||||
ffmpeg
|
ffmpeg
|
||||||
antiword
|
antiword
|
||||||
# libwpd
|
libwpd
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(
|
target_link_libraries(
|
||||||
scan
|
scan
|
||||||
PUBLIC
|
PUBLIC
|
||||||
|
|
||||||
|
cjson
|
||||||
${LibArchive_LIBRARIES}
|
${LibArchive_LIBRARIES}
|
||||||
ZLIB::ZLIB
|
ZLIB::ZLIB
|
||||||
BZip2::BZip2
|
BZip2::BZip2
|
||||||
@ -160,7 +164,8 @@ target_link_libraries(
|
|||||||
|
|
||||||
${MOBI_LIB_DIR}/libmobi.a
|
${MOBI_LIB_DIR}/libmobi.a
|
||||||
|
|
||||||
# ${WPD_LIB_DIR}/libwpd-0.9.a
|
${WPD_LIB_DIR}/libwpd-0.9.a
|
||||||
|
${WPD_LIB_DIR}/libwpd-stream-0.9.a
|
||||||
|
|
||||||
${FREETYPE_LIB}
|
${FREETYPE_LIB}
|
||||||
${HARFBUZZ_LIB}
|
${HARFBUZZ_LIB}
|
||||||
@ -205,7 +210,7 @@ target_include_directories(
|
|||||||
${LIBXML2_INCLUDE_DIR}
|
${LIBXML2_INCLUDE_DIR}
|
||||||
${FFMPEG_INCLUDE_DIR}
|
${FFMPEG_INCLUDE_DIR}
|
||||||
${MOBI_INCLUDE_DIR}
|
${MOBI_INCLUDE_DIR}
|
||||||
# ${WPD_INCLUDE_DIR}
|
${WPD_INCLUDE_DIR}
|
||||||
)
|
)
|
||||||
|
|
||||||
if (BUILD_TESTS)
|
if (BUILD_TESTS)
|
||||||
|
119
libscan/json/json.c
Normal file
119
libscan/json/json.c
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
#include "json.h"
|
||||||
|
#include "cjson/cJSON.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
|
||||||
|
|
||||||
|
/**
 * Recursively collect the string values of a cJSON tree into a text buffer.
 *
 * Every string value is appended to `tex` followed by a single space.
 * Objects and arrays are traversed depth-first through their child list;
 * other value types contribute nothing.
 *
 * @param json Node to walk.
 * @param tex  Destination text buffer.
 * @return TRUE as soon as an append reports TEXT_BUF_FULL, FALSE otherwise.
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsString(json)) {
        // The separator is only appended when the string itself still fit.
        int full = text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL
                   || text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
        return full ? TRUE : FALSE;
    }

    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
        // Both container kinds are walked the same way: via the child list.
        for (cJSON *item = json->child; item != NULL; item = item->next) {
            if (json_extract_text(item, tex)) {
                return TRUE;
            }
        }
    }

    return FALSE;
}
|
||||||
|
|
||||||
|
/**
 * Extract the string values of a JSON document into the document's
 * MetaContent metadata.
 *
 * @param ctx Scan context; supplies the logging callbacks and `content_size`,
 *            which is passed to text_buffer_create().
 * @param f   File to read; it is read in full into memory.
 * @param doc Document that receives the MetaContent entry.
 * @return SCAN_ERR_SKIP when the file is larger than JSON_MAX_FILE_SIZE,
 *         SCAN_ERR_READ when the file cannot be read or memory runs out,
 *         SCAN_OK otherwise.
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // The buffer is grown by one byte so that it can be NUL-terminated before
    // it is handed to cJSON. The realloc result is kept in a separate pointer
    // so that the original allocation can still be released on failure.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    // NOTE(review): when parsing fails `json` is NULL; cJSON documents its
    // type checks and cJSON_Delete as NULL-safe, so this appears to fall
    // through with empty content and SCAN_OK -- confirm that is intended.
    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||||
|
|
||||||
|
#define JSON_BUF_SIZE (1024 * 1024 * 5)
|
||||||
|
|
||||||
|
/**
 * Extract the string values of a newline-delimited JSON file into the
 * document's MetaContent metadata.
 *
 * The file is streamed through a fixed JSON_BUF_SIZE window: one JSON value
 * is parsed from the start of the window, the consumed bytes are dropped by
 * shifting the remainder to the front, and the freed tail is refilled from
 * the file.
 *
 * @param ctx Scan context; used by the logging macro.
 * @param f   File to stream from via `f->read`.
 * @param doc Document that receives the MetaContent entry.
 * @return SCAN_ERR_READ when the window cannot be allocated, SCAN_OK otherwise.
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    char *buf = malloc(JSON_BUF_SIZE + 1);
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }
    // Sentinel terminator one past the window, so a full window is a C string.
    buf[JSON_BUF_SIZE] = '\0';

    // NOTE(review): unlike parse_json this does not size the text buffer from
    // ctx->content_size -- confirm whether the unbounded buffer is intended.
    text_buffer_t tex = text_buffer_create(-1);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        cJSON *json;

        if (!eof) {
            // First pass fills the whole window; later passes refill exactly
            // the number of bytes consumed by the previous parse.
            to_read = parse_end == buf ? JSON_BUF_SIZE : (size_t) (parse_end - buf);
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
                // Presumably read() may report an error with a negative value,
                // which would wrap to a huge size_t; treat that as "no data".
                if (ret > to_read) {
                    ret = 0;
                }
                // Terminate right after the last valid byte so the parser
                // never sees uninitialised or stale data past end of file.
                ptr[ret] = '\0';
            }
        }

        json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        if (parse_end == buf) {
            // Nothing was consumed: no more data to parse.
            cJSON_Delete(json);
            break;
        }

        json_extract_text(json, &tex);

        cJSON_Delete(json);

        // Drop the consumed prefix and point `ptr` at the freed tail.
        memmove(buf, parse_end, (buf + JSON_BUF_SIZE - parse_end));
        ptr = buf + JSON_BUF_SIZE - parse_end + buf;
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
30
libscan/json/json.h
Normal file
30
libscan/json/json.h
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#ifndef SCAN_JSON_H
|
||||||
|
#define SCAN_JSON_H
|
||||||
|
|
||||||
|
#include "../scan.h"
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
long content_size;
|
||||||
|
log_callback_t log;
|
||||||
|
logf_callback_t logf;
|
||||||
|
store_callback_t store;
|
||||||
|
unsigned int json_mime;
|
||||||
|
unsigned int ndjson_mime;
|
||||||
|
} scan_json_ctx_t;
|
||||||
|
|
||||||
|
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||||
|
|
||||||
|
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||||
|
|
||||||
|
__always_inline
|
||||||
|
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||||
|
return mime == ctx->json_mime;
|
||||||
|
}
|
||||||
|
|
||||||
|
__always_inline
|
||||||
|
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||||
|
return mime == ctx->ndjson_mime;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
@ -20,7 +20,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
|||||||
opts->iPageWidth = 595;
|
opts->iPageWidth = 595;
|
||||||
opts->eImageLevel = level_ps_3;
|
opts->eImageLevel = level_ps_3;
|
||||||
|
|
||||||
int doc_word_version = iGuessVersionNumber(file_in, buf_len);
|
int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
|
||||||
if (doc_word_version < 0 || doc_word_version == 3) {
|
if (doc_word_version < 0 || doc_word_version == 3) {
|
||||||
free(buf);
|
free(buf);
|
||||||
return;
|
return;
|
||||||
@ -38,7 +38,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
iInitDocument(file_in, buf_len);
|
iInitDocument(file_in, (int) buf_len);
|
||||||
const char *author = szGetAuthor();
|
const char *author = szGetAuthor();
|
||||||
if (author != NULL) {
|
if (author != NULL) {
|
||||||
APPEND_UTF8_META(doc, MetaAuthor, author)
|
APPEND_UTF8_META(doc, MetaAuthor, author)
|
||||||
@ -50,7 +50,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
|||||||
}
|
}
|
||||||
vFreeDocument();
|
vFreeDocument();
|
||||||
|
|
||||||
bWordDecryptor(file_in, buf_len, diag);
|
bWordDecryptor(file_in, (int) buf_len, diag);
|
||||||
vDestroyDiagram(diag);
|
vDestroyDiagram(diag);
|
||||||
fclose(file_out);
|
fclose(file_out);
|
||||||
|
|
||||||
@ -93,7 +93,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
|
|||||||
opts->iPageWidth = 595;
|
opts->iPageWidth = 595;
|
||||||
opts->eImageLevel = level_ps_3;
|
opts->eImageLevel = level_ps_3;
|
||||||
|
|
||||||
int doc_word_version = iGuessVersionNumber(file, buf_len);
|
int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
|
||||||
if (doc_word_version < 0 || doc_word_version == 3) {
|
if (doc_word_version < 0 || doc_word_version == 3) {
|
||||||
free(buf);
|
free(buf);
|
||||||
return;
|
return;
|
||||||
@ -110,7 +110,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
bWordDecryptor(file, buf_len, diag);
|
bWordDecryptor(file, (int) buf_len, diag);
|
||||||
vDestroyDiagram(diag);
|
vDestroyDiagram(diag);
|
||||||
|
|
||||||
fclose(file_out);
|
fclose(file_out);
|
||||||
|
@ -22,6 +22,7 @@ typedef void (*log_callback_t)(const char *filepath, int level, char *str);
|
|||||||
typedef int scan_code_t;
|
typedef int scan_code_t;
|
||||||
#define SCAN_OK (scan_code_t) 0
|
#define SCAN_OK (scan_code_t) 0
|
||||||
#define SCAN_ERR_READ (scan_code_t) (-1)
|
#define SCAN_ERR_READ (scan_code_t) (-1)
|
||||||
|
#define SCAN_ERR_SKIP (scan_code_t) (-2)
|
||||||
|
|
||||||
#define LEVEL_DEBUG 0
|
#define LEVEL_DEBUG 0
|
||||||
#define LEVEL_INFO 1
|
#define LEVEL_INFO 1
|
||||||
|
@ -35,7 +35,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
return SCAN_OK;
|
return SCAN_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_MARKUP_SIZE 1024 * 1024
|
#define MAX_MARKUP_SIZE (1024 * 1024)
|
||||||
|
|
||||||
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||||
|
|
||||||
|
@ -9,11 +9,11 @@
|
|||||||
|
|
||||||
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
||||||
|
|
||||||
#define TEXT_BUF_FULL -1
|
#define TEXT_BUF_FULL (-1)
|
||||||
#define INITIAL_BUF_SIZE 1024 * 16
|
#define INITIAL_BUF_SIZE (1024 * 16)
|
||||||
|
|
||||||
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
|
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
|
||||||
#define SHOULD_KEEP_CHAR(c) ((c >= '\'' && c <= ';') || (c >= 'A' && c <= 'z') || (c > 127))
|
#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
|
||||||
|
|
||||||
|
|
||||||
typedef struct dyn_buffer {
|
typedef struct dyn_buffer {
|
||||||
|
200
libscan/wpd/libwpd_c_api.cpp
Normal file
200
libscan/wpd/libwpd_c_api.cpp
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
#include "libwpd_c_api.h"
|
||||||
|
#include "libwpd/libwpd.h"
|
||||||
|
#include "libwpd/WPXProperty.h"
|
||||||
|
#include "libwpd-stream/libwpd-stream.h"
|
||||||
|
|
||||||
|
class StringDocument : public WPXDocumentInterface {
|
||||||
|
|
||||||
|
private:
|
||||||
|
text_buffer_t *tex;
|
||||||
|
document_t *doc;
|
||||||
|
bool is_full;
|
||||||
|
public:
|
||||||
|
|
||||||
|
StringDocument(text_buffer_t *tex, document_t *doc) {
|
||||||
|
this->tex = tex;
|
||||||
|
this->doc = doc;
|
||||||
|
this->is_full = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void setDocumentMetaData(const WPXPropertyList &propList) override {
|
||||||
|
|
||||||
|
WPXPropertyList::Iter propIter(propList);
|
||||||
|
for (propIter.rewind(); propIter.next();) {
|
||||||
|
// TODO: Read metadata here ?!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void endDocument() override {
|
||||||
|
text_buffer_terminate_string(this->tex);
|
||||||
|
}
|
||||||
|
|
||||||
|
void closeParagraph() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void closeSpan() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void closeSection() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void insertTab() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void insertSpace() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void insertText(const WPXString &text) override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void insertLineBreak() override {
|
||||||
|
if (!this->is_full) {
|
||||||
|
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||||
|
this->is_full = true;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closePageSpan() override { /* noop */ }
|
||||||
|
|
||||||
|
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeHeader() override { /* noop */ }
|
||||||
|
|
||||||
|
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeFooter() override { /* noop */ }
|
||||||
|
|
||||||
|
void
|
||||||
|
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||||
|
|
||||||
|
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||||
|
|
||||||
|
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void
|
||||||
|
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||||
|
|
||||||
|
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||||
|
|
||||||
|
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeOrderedListLevel() override { /* noop */ }
|
||||||
|
|
||||||
|
void closeUnorderedListLevel() override { /* noop */ }
|
||||||
|
|
||||||
|
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeListElement() override { /* noop */ }
|
||||||
|
|
||||||
|
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeFootnote() override { /* noop */ }
|
||||||
|
|
||||||
|
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeEndnote() override { /* noop */ }
|
||||||
|
|
||||||
|
void openComment(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeComment() override { /* noop */ }
|
||||||
|
|
||||||
|
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeTextBox() override { /* noop */ }
|
||||||
|
|
||||||
|
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||||
|
|
||||||
|
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeTableRow() override { /* noop */ }
|
||||||
|
|
||||||
|
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeTableCell() override { /* noop */ }
|
||||||
|
|
||||||
|
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeTable() override { /* noop */ }
|
||||||
|
|
||||||
|
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void closeFrame() override { /* noop */ }
|
||||||
|
|
||||||
|
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
|
||||||
|
|
||||||
|
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
|
||||||
|
|
||||||
|
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
|
||||||
|
|
||||||
|
void startDocument() override { /* noop */ };
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Create a WPXStringStream over `buf_len` bytes at `buf` and return it as an
 * opaque handle. Release it with wpd_memory_stream_destroy().
 */
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
|
||||||
|
|
||||||
|
/**
 * Ask libwpd whether it supports the data behind the stream handle.
 *
 * @param ptr Handle obtained from wpd_memory_stream_create().
 * @return The WPDConfidence value reported by libwpd, converted to the C enum.
 */
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    return static_cast<wpd_confidence_t>(WPDocument::isFileFormatSupported(input));
}
|
||||||
|
|
||||||
|
/**
 * Parse the document behind the stream handle, collecting its text in `tex`.
 *
 * @param ptr Handle obtained from wpd_memory_stream_create().
 * @param tex Text buffer that the StringDocument callbacks append to.
 * @param doc Document handed to the StringDocument collector.
 * @return The WPDResult value reported by libwpd, converted to the C enum.
 */
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    auto *input = static_cast<WPXStringStream *>(ptr);

    StringDocument collector(tex, doc);
    const WPDResult status = WPDocument::parse(input, &collector, nullptr);

    return static_cast<wpd_result_t>(status);
}
|
||||||
|
|
||||||
|
/** Destroy a stream handle previously returned by wpd_memory_stream_create(). */
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete static_cast<WPXStringStream *>(ptr);
}
|
50
libscan/wpd/libwpd_c_api.h
Normal file
50
libscan/wpd/libwpd_c_api.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef SIST2_LIBWPD_C_API_H
|
||||||
|
#define SIST2_LIBWPD_C_API_H
|
||||||
|
|
||||||
|
#include "stdlib.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
#define EXTERNC extern "C"
|
||||||
|
#else
|
||||||
|
#define EXTERNC
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
#include "../scan.h"
|
||||||
|
#include "../util.h"
|
||||||
|
#ifdef __cplusplus
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
typedef void *wpd_stream_t;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
C_WPD_CONFIDENCE_NONE = 0,
|
||||||
|
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
|
||||||
|
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
|
||||||
|
C_WPD_CONFIDENCE_EXCELLENT
|
||||||
|
} wpd_confidence_t;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
C_WPD_OK,
|
||||||
|
C_WPD_FILE_ACCESS_ERROR,
|
||||||
|
C_WPD_PARSE_ERROR,
|
||||||
|
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
|
||||||
|
C_WPD_PASSWORD_MISSMATCH_ERROR,
|
||||||
|
C_WPD_OLE_ERROR,
|
||||||
|
C_WPD_UNKNOWN_ERROR
|
||||||
|
} wpd_result_t;
|
||||||
|
|
||||||
|
|
||||||
|
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
|
||||||
|
|
||||||
|
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
|
||||||
|
|
||||||
|
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
|
||||||
|
|
||||||
|
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
|
||||||
|
|
||||||
|
#endif
|
41
libscan/wpd/wpd.c
Normal file
41
libscan/wpd/wpd.c
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#include "wpd.h"
|
||||||
|
#include "libwpd_c_api.h"
|
||||||
|
|
||||||
|
/**
 * Extract the text of a WordPerfect document into the document's MetaContent
 * metadata.
 *
 * @param ctx Scan context; used by the logging macros.
 * @param f   File to read; it is read in full into memory.
 * @param doc Document that receives the MetaContent entry.
 * @return SCAN_ERR_READ when the file cannot be read, is reported as
 *         encrypted, or is not recognised with excellent confidence;
 *         SCAN_OK otherwise. A parse error is logged, and any text that was
 *         collected before it is still stored.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    // Same guard as parse_json: do not build a stream over a failed read.
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    void *stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(-1);
    wpd_result_t res = wpd_parse(stream, &tex, doc);

    if (res != C_WPD_OK) {
        CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                       doc->filepath, res)
    }

    // Only store content when something was actually collected.
    if (tex.dyn_buffer.cur != 0) {
        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    }

    text_buffer_destroy(&tex);
    wpd_memory_stream_destroy(stream);
    free(buf);

    return SCAN_OK;
}
|
23
libscan/wpd/wpd.h
Normal file
23
libscan/wpd/wpd.h
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#ifndef SIST2_WPD_H
|
||||||
|
#define SIST2_WPD_H
|
||||||
|
|
||||||
|
#include "../scan.h"
|
||||||
|
#include "../util.h"
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
long content_size;
|
||||||
|
|
||||||
|
log_callback_t log;
|
||||||
|
logf_callback_t logf;
|
||||||
|
|
||||||
|
unsigned int wpd_mime;
|
||||||
|
} scan_wpd_ctx_t;
|
||||||
|
|
||||||
|
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||||
|
|
||||||
|
__always_inline
|
||||||
|
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
|
||||||
|
return mime == ctx->wpd_mime;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
@ -11,6 +11,8 @@ extern "C" {
|
|||||||
#include "../libscan/mobi/scan_mobi.h"
|
#include "../libscan/mobi/scan_mobi.h"
|
||||||
#include "../libscan/raw/raw.h"
|
#include "../libscan/raw/raw.h"
|
||||||
#include "../libscan/msdoc/msdoc.h"
|
#include "../libscan/msdoc/msdoc.h"
|
||||||
|
#include "../libscan/wpd/wpd.h"
|
||||||
|
#include "../libscan/json/json.h"
|
||||||
#include <libavutil/avutil.h>
|
#include <libavutil/avutil.h>
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -39,6 +41,10 @@ static scan_msdoc_ctx_t msdoc_ctx;
|
|||||||
|
|
||||||
static scan_msdoc_ctx_t msdoc_text_ctx;
|
static scan_msdoc_ctx_t msdoc_text_ctx;
|
||||||
|
|
||||||
|
static scan_wpd_ctx_t wpd_ctx;
|
||||||
|
|
||||||
|
static scan_json_ctx_t json_ctx;
|
||||||
|
|
||||||
|
|
||||||
document_t LastSubDoc;
|
document_t LastSubDoc;
|
||||||
|
|
||||||
@ -953,6 +959,38 @@ TEST(Msdoc, TestFuzz1) {
|
|||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(Wpd, Wpd51_1) {
|
||||||
|
vfile_t f;
|
||||||
|
document_t doc;
|
||||||
|
load_doc_file("libscan-test-files/test_files/wpd/test51_1.wpd", &f, &doc);
|
||||||
|
|
||||||
|
parse_wpd(&wpd_ctx, &f, &doc);
|
||||||
|
|
||||||
|
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val,
|
||||||
|
"Hello, WordPerfect This is a test This is the next page This is another page");
|
||||||
|
|
||||||
|
cleanup(&doc, &f);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Json, Json1) {
|
||||||
|
vfile_t f;
|
||||||
|
document_t doc;
|
||||||
|
load_doc_file("libscan-test-files/test_files/json/json1.json", &f, &doc);
|
||||||
|
|
||||||
|
parse_json(&json_ctx, &f, &doc);
|
||||||
|
|
||||||
|
cleanup(&doc, &f);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Json, NDJson1) {
|
||||||
|
vfile_t f;
|
||||||
|
document_t doc;
|
||||||
|
load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &f, &doc);
|
||||||
|
|
||||||
|
parse_ndjson(&json_ctx, &f, &doc);
|
||||||
|
|
||||||
|
cleanup(&doc, &f);
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
@ -1034,6 +1072,14 @@ int main(int argc, char **argv) {
|
|||||||
msdoc_text_ctx.content_size = 500;
|
msdoc_text_ctx.content_size = 500;
|
||||||
msdoc_text_ctx.tn_size = 0;
|
msdoc_text_ctx.tn_size = 0;
|
||||||
|
|
||||||
|
wpd_ctx.log = noop_log;
|
||||||
|
wpd_ctx.logf = noop_logf;
|
||||||
|
wpd_ctx.content_size = 500;
|
||||||
|
|
||||||
|
json_ctx.log = noop_log;
|
||||||
|
json_ctx.logf = noop_logf;
|
||||||
|
json_ctx.content_size = 5000;
|
||||||
|
|
||||||
av_log_set_level(AV_LOG_QUIET);
|
av_log_set_level(AV_LOG_QUIET);
|
||||||
::testing::InitGoogleTest(&argc, argv);
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
return RUN_ALL_TESTS();
|
return RUN_ALL_TESTS();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user