mirror of
https://github.com/simon987/sist2.git
synced 2025-12-12 15:08:53 +00:00
130 lines
3.8 KiB
C
130 lines
3.8 KiB
C
#include "doc.h"
|
|
#include "src/ctx.h"
|
|
|
|
/**
 * Recursively walk the element the reader is currently positioned on and
 * append the text found inside every child element named "t" to buf. Each
 * text fragment is followed by a single space. Any other child element is
 * descended into by calling dump_text() again.
 *
 * @param reader MCE text reader positioned on the element to walk
 * @param buf    buffer that receives the extracted text
 * @return 0 on success, -1 when libxml2 reported a fatal error; a non-zero
 *         result from a nested call is returned unchanged
 */
int dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {

    mce_skip_attributes(reader);

    xmlErrorPtr err = xmlGetLastError();
    if (err != NULL) {
        int is_fatal = (err->level == XML_ERR_FATAL);

        if (is_fatal) {
            LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
        } else {
            LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
        }

        // libxml2 keeps the last error until it is explicitly cleared. Without
        // this reset the same error would be reported again on every nested
        // call, and a fatal error left behind by one document would make the
        // next one fail too. Must happen after logging: the reset releases
        // err->message.
        xmlResetLastError();

        if (is_fatal) {
            return -1;
        }
    }

    mce_start_children(reader) {
        mce_start_element(reader, NULL, _X("t")) {
            mce_skip_attributes(reader);
            mce_start_children(reader) {
                mce_start_text(reader) {
                    char *str = (char *) xmlTextReaderConstValue(reader->reader);
                    // xmlTextReaderConstValue() returns NULL when the node has no value
                    if (str != NULL) {
                        dyn_buffer_append_string(buf, str);
                        dyn_buffer_write_char(buf, ' ');
                    }
                } mce_end_text(reader);
            } mce_end_children(reader);
        } mce_end_element(reader);

        mce_start_element(reader, NULL, NULL) {
            int ret = dump_text(reader, buf);
            if (ret != 0) {
                return ret;
            }
        } mce_end_element(reader);

    } mce_end_children(reader)
    return 0;
}
|
|
|
|
__always_inline
|
|
int should_read_part(opcPart part) {
|
|
|
|
char *part_name = (char *) part;
|
|
|
|
if (part == NULL) {
|
|
return FALSE;
|
|
}
|
|
|
|
if ( // Word
|
|
strcmp(part_name, "word/document.xml") == 0
|
|
|| strncmp(part_name, "word/footer", sizeof("word/footer") - 1) == 0
|
|
|| strncmp(part_name, "word/header", sizeof("word/header") - 1) == 0
|
|
// PowerPoint
|
|
|| strncmp(part_name, "ppt/slides/slide", sizeof("ppt/slides/slide") - 1) == 0
|
|
|| strncmp(part_name, "ppt/notesSlides/notesSlide", sizeof("ppt/notesSlides/notesSlide") - 1) == 0
|
|
// Excel
|
|
|| strncmp(part_name, "xl/worksheets/sheet", sizeof("xl/worksheets/sheet") - 1) == 0
|
|
|| strcmp(part_name, "xl/sharedStrings.xml") == 0
|
|
|| strcmp(part_name, "xl/workbook.xml") == 0
|
|
) {
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
__always_inline
|
|
int read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
|
|
|
|
mceTextReader_t reader;
|
|
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
|
|
|
if (ret != OPC_ERROR_NONE) {
|
|
LOG_ERRORF(doc->filepath, "(doc.c) opcXmlReaderOpen() returned error code %d", ret);
|
|
return -1;
|
|
}
|
|
|
|
mce_start_document(&reader) {
|
|
mce_start_element(&reader, NULL, NULL) {
|
|
ret = dump_text(&reader, buf);
|
|
if (ret != 0) {
|
|
mceTextReaderCleanup(&reader);
|
|
return -1;
|
|
}
|
|
} mce_end_element(&reader);
|
|
} mce_end_document(&reader);
|
|
|
|
mceTextReaderCleanup(&reader);
|
|
return 0;
|
|
}
|
|
|
|
/**
 * Extract the text of an in-memory OOXML document and attach it to doc as a
 * MetaContent meta line.
 *
 * Every part accepted by should_read_part() is read in turn; reading stops at
 * the first part that fails. Whatever text was collected up to that point is
 * still attached. Nothing is attached when no text was collected.
 *
 * @param mem     document bytes; the function returns immediately when NULL
 * @param mem_len size of mem in bytes
 * @param doc     document that receives the extracted content; its filepath
 *                is used for logging
 */
void parse_doc(void *mem, size_t mem_len, document_t *doc) {

    if (mem == NULL) {
        return;
    }

    opcContainer *c = opcContainerOpenMem(mem, mem_len, OPC_OPEN_READ_ONLY, NULL);
    if (c == NULL) {
        LOG_ERROR(doc->filepath, "(doc.c) Couldn't open document with opcContainerOpenMem()");
        return;
    }

    dyn_buffer_t buf = dyn_buffer_create();

    // Stop as soon as there is no part left, so that a NULL part is never
    // handed to opcPartGetNext() (e.g. when the container has no parts).
    for (opcPart part = opcPartGetFirst(c); part != NULL; part = opcPartGetNext(c, part)) {
        if (should_read_part(part)) {
            int ret = read_part(c, &buf, part, doc);
            if (ret != 0) {
                break;
            }
        }
    }

    opcContainerClose(c, OPC_CLOSE_NOW);

    if (buf.cur > 0) {
        dyn_buffer_write_char(&buf, '\0');

        // buf.cur now includes the terminating NUL written just above
        meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.cur);
        if (meta == NULL) {
            LOG_ERROR(doc->filepath, "(doc.c) Couldn't allocate memory for document content");
        } else {
            meta->key = MetaContent;
            strcpy(meta->strval, buf.buf);
            APPEND_META(doc, meta)
        }
    }

    dyn_buffer_destroy(&buf);
}
|