diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ef7dd8..689837a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ find_package(Tesseract CONFIG REQUIRED) find_package(harfbuzz CONFIG REQUIRED) find_package(OpenJPEG CONFIG REQUIRED) find_package(JPEG REQUIRED) +find_package(LibXml2 REQUIRED) include(ExternalProject) @@ -41,6 +42,7 @@ add_library( libscan/arc/arc.c libscan/arc/arc.h libscan/ebook/ebook.c libscan/ebook/ebook.h libscan/cbr/cbr.c libscan/cbr/cbr.h + libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h third-party/utf8.h ) @@ -76,6 +78,7 @@ target_link_libraries( ${JPEG_LIBRARIES} ${LibArchive_LIBRARIES} ${Tesseract_LIBRARIES} + ${LIBXML2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} @@ -88,6 +91,7 @@ target_include_directories( PRIVATE "${CMAKE_SOURCE_DIR}/third-party/ext_mupdf/src/mupdf/include/" ${JPEG_INCLUDE_DIR} + ${LIBXML2_INCLUDE_DIR} ) diff --git a/README.md b/README.md index 71ff6e5..4984801 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ ```bash -vcpkg install libarchive pthread tesseract +vcpkg install libarchive pthread tesseract libxml2 -rm -rf CMakeFiles/ CMakeCache.txt cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake . 
make -j 4 ``` \ No newline at end of file
diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c
new file mode 100644
index 0000000..11beb19
--- /dev/null
+++ b/libscan/ooxml/ooxml.c
@@ -0,0 +1,217 @@
+#include "ooxml.h"
+
+#include "../util.h"
+#include <archive.h>
+#include <archive_entry.h>
+#include <libxml/xmlstring.h>
+#include <libxml/parser.h>
+
+/**
+ * Tell whether an archive member is one of the OOXML parts scanned for text.
+ *
+ * @param part Path of the member inside the container, may be NULL.
+ * @return TRUE if `part` starts with one of the Word, PowerPoint or Excel
+ *         prefixes listed below, FALSE otherwise (including for NULL).
+ */
+__always_inline
+static int should_read_part(const char *part) {
+
+//    LOG_DEBUGF("ooxml.c", "Got part : %s", part)
+
+    if (part == NULL) {
+        return FALSE;
+    }
+
+    if (    // Word
+            STR_STARTS_WITH(part, "word/document.xml")
+            || STR_STARTS_WITH(part, "word/footnotes.xml")
+            || STR_STARTS_WITH(part, "word/endnotes.xml")
+            || STR_STARTS_WITH(part, "word/footer")
+            || STR_STARTS_WITH(part, "word/header")
+            // PowerPoint
+            || STR_STARTS_WITH(part, "ppt/slides/slide")
+            || STR_STARTS_WITH(part, "ppt/notesSlides/slide")
+            // Excel
+            || STR_STARTS_WITH(part, "xl/worksheets/sheet")
+            || STR_STARTS_WITH(part, "xl/sharedStrings.xml")
+            || STR_STARTS_WITH(part, "xl/workbook.xml")
+            ) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/**
+ * Append the text of every element named "t" found in `node`, its following
+ * siblings and all of their descendants to `buf`, adding a space after each.
+ */
+static void append_text_elements(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
+    for (xmlNode *child = node; child != NULL; child = child->next) {
+        // xmlStrEqual() copes with a NULL name, a raw dereference does not
+        if (xmlStrEqual(child->name, BAD_CAST "t")) {
+            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
+
+            if (text != NULL) {
+                text_buffer_append_string0(buf, (char *) text);
+                text_buffer_append_char(buf, ' ');
+                xmlFree(text);
+            }
+        }
+
+        append_text_elements(xml, child->children, buf);
+    }
+}
+
+/**
+ * Extract the text of `node`, its siblings and their descendants into `buf`.
+ *
+ * @param xml  Document that owns `node`.
+ * @param node First node to visit, may be NULL.
+ * @param buf  Destination text buffer.
+ * @return -1 without touching `buf` if libxml2's last recorded error is fatal,
+ *         0 otherwise.
+ */
+int extract_text(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
+    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
+
+    // The error state is inspected once up front instead of once per visited node
+    const xmlError *err = xmlGetLastError();
+    if (err != NULL) {
+        if (err->level == XML_ERR_FATAL) {
+//            LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
+            return -1;
+        } else {
+//            LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message)
+        }
+    }
+
+    append_text_elements(xml, node, buf);
+    return 0;
+}
+
+/**
+ * libxml2 input callback: pull up to `len` bytes of the current archive entry.
+ *
+ * @param context The `struct archive *` handed to xmlReadIO().
+ * @return Number of bytes stored in `buffer`, 0 at end of entry, -1 on error.
+ */
+int xml_io_read(void *context, char *buffer, int len) {
+    struct archive *a = context;
+
+    if (len <= 0) {
+        return 0;
+    }
+
+    // At most `len` bytes come back, so the result always fits in an int
+    int n = (int) archive_read_data(a, buffer, (size_t) len);
+
+    // libarchive signals failure with negative ARCHIVE_* codes, libxml2 wants -1
+    return n < 0 ? -1 : n;
+}
+
+/** libxml2 close callback: nothing to do, parse_doc() releases the archive handle. */
+int xml_io_close(UNUSED(void *context)) {
+    return 0;
+}
+
+/**
+ * Parse the archive entry `a` is positioned on as XML and append its text to `buf`.
+ *
+ * @return -1 if the entry cannot be parsed or has no root element, 0 otherwise.
+ */
+__always_inline
+static int read_part(struct archive *a, text_buffer_t *buf, document_t *doc) {
+
+    // extract_text() inspects libxml2's last recorded error: reset it so that an
+    // error left behind by a previously parsed part is not blamed on this one
+    xmlResetLastError();
+
+    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
+                            XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
+
+    if (xml == NULL) {
+//        LOG_ERROR(doc->filepath, "Could not parse XML")
+        return -1;
+    }
+
+    xmlNode *root = xmlDocGetRootElement(xml);
+    if (root == NULL) {
+//        LOG_ERROR(doc->filepath, "Empty document")
+        xmlFreeDoc(xml);
+        return -1;
+    }
+
+    // Best effort: a part rejected by extract_text() is skipped, not treated as fatal
+    (void) extract_text(xml, root, buf);
+    xmlFreeDoc(xml);
+
+    return 0;
+}
+
+/**
+ * Extract the text of an OOXML container and attach it to `doc` as a
+ * MetaContent meta line.
+ *
+ * @param ctx Scan settings; `content_size` is handed to text_buffer_create().
+ * @param f   File the container is read from.
+ * @param doc Document that receives the extracted text, if any was found.
+ */
+void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc) {
+
+    size_t buf_len;
+    void *buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        return;
+    }
+
+    struct archive *a = archive_read_new();
+    if (a == NULL) {
+        free(buf);
+        return;
+    }
+    archive_read_support_format_zip(a);
+
+    int ret = archive_read_open_memory(a, buf, buf_len);
+    if (ret != ARCHIVE_OK) {
+//        LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
+        archive_read_free(a);
+        free(buf);
+        return;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+
+    struct archive_entry *entry;
+    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
+        if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
+            const char *path = archive_entry_pathname(entry);
+
+            if (should_read_part(path)) {
+                ret = read_part(a, &tex, doc);
+                if (ret != 0) {
+                    break;
+                }
+            }
+        }
+    }
+
+    if (tex.dyn_buffer.cur > 0) {
+        text_buffer_terminate_string(&tex);
+
+        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
+        if (meta != NULL) {
+            meta->key = MetaContent;
+            strcpy(meta->str_val, tex.dyn_buffer.buf);
+            APPEND_META(doc, meta)
+        }
+    }
+
+    archive_read_close(a);
+    archive_read_free(a);
+    text_buffer_destroy(&tex);
+
+    // The archive reads straight out of `buf`, so it is released only after `a`.
+    // NOTE(review): assumes read_all() returns malloc()ed memory owned by the caller
+    free(buf);
+}
diff --git a/libscan/ooxml/ooxml.h b/libscan/ooxml/ooxml.h new file mode 100644 index 0000000..369288c --- /dev/null +++ b/libscan/ooxml/ooxml.h @@ -0,0 +1,13 @@
+#ifndef SCAN_OOXML_H +#define SCAN_OOXML_H + +#include <stdlib.h> +#include "../scan.h" + +typedef struct { + long content_size; +} scan_ooxml_cxt_t; + +void parse_doc(scan_ooxml_cxt_t *ctx, vfile_t *f, document_t *doc); + +#endif diff --git a/libscan/scan.h b/libscan/scan.h index 626ddb8..c0147b8 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -5,6 +5,8 @@ #include <sys/stat.h> #include "../third-party/uuid/src/uuid.h" +#include "macros.h" + #define META_INT_MASK 0x80 #define META_STR_MASK 0x40 diff --git a/libscan/util.h b/libscan/util.h index 05c7a62..6115d46 100644 --- a/libscan/util.h +++ b/libscan/util.h @@ -7,6 +7,8 @@ #include "../third-party/utf8.h/utf8.h" #include "macros.h" +#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0) + #define TEXT_BUF_FULL -1 #define INITIAL_BUF_SIZE 1024 * 16 diff --git a/test/main.c b/test/main.c index 7e9008d..3b31369 100644 --- a/test/main.c +++ b/test/main.c @@ -2,6 +2,7 @@ #include <fcntl.h> #include "../libscan/arc/arc.h" #include "../libscan/ebook/ebook.h" +#include "../libscan/ooxml/ooxml.h" int main() {