mirror of
https://github.com/simon987/sist2.git
synced 2025-12-11 14:38:54 +00:00
OOXML files support
This commit is contained in:
@@ -10,7 +10,7 @@
|
||||
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
|
||||
|
||||
|
||||
static const char *const Version = "1.1.10";
|
||||
static const char *const Version = "1.1.11";
|
||||
static const char *const usage[] = {
|
||||
"sist2 scan [OPTION]... PATH",
|
||||
"sist2 index [OPTION]... INDEX",
|
||||
@@ -23,6 +23,7 @@ void global_init() {
|
||||
curl_global_init(CURL_GLOBAL_NOTHING);
|
||||
#endif
|
||||
av_log_set_level(AV_LOG_QUIET);
|
||||
opcInitLibrary();
|
||||
}
|
||||
|
||||
void init_dir(const char *dirpath) {
|
||||
|
||||
94
src/parsing/doc.c
Normal file
94
src/parsing/doc.c
Normal file
@@ -0,0 +1,94 @@
|
||||
#include "doc.h"
|
||||
|
||||
/**
 * Recursively collect the text of every "t" element below the element the
 * reader is currently positioned on.
 *
 * Each text node found directly inside a "t" element is appended to buf
 * followed by a single space, so consecutive runs stay separated. Every
 * other child element is descended into by calling dumpText() again.
 *
 * NOTE(review): the NULL namespace / NULL local-name arguments are presumably
 * treated as wildcards by the mce_* macros -- confirm against libopc.
 *
 * @param reader MCE text reader positioned on an element start
 * @param buf    buffer receiving the extracted text
 */
static void dumpText(mceTextReader_t *reader, dyn_buffer_t *buf) {

    mce_skip_attributes(reader);

    mce_start_children(reader) {
        mce_start_element(reader, NULL, _X("t")) {
            mce_skip_attributes(reader);
            mce_start_children(reader) {
                mce_start_text(reader) {
                    char *str = (char *) xmlTextReaderConstValue(reader->reader);
                    // libxml2 documents a NULL return when no value is
                    // available; never hand that to the string buffer.
                    if (str != NULL) {
                        dyn_buffer_append_string(buf, str);
                        dyn_buffer_write_char(buf, ' ');
                    }
                } mce_end_text(reader);
            } mce_end_children(reader);
        } mce_end_element(reader);

        // Anything that is not a "t" element: recurse into it
        mce_start_element(reader, NULL, NULL) {
            dumpText(reader, buf);
        } mce_end_element(reader);

    } mce_end_children(reader);
}
|
||||
|
||||
__always_inline
|
||||
int should_read_part(opcPart part) {
|
||||
|
||||
char *part_name = (char *) part;
|
||||
|
||||
if ( // Word
|
||||
strcmp(part_name, "word/document.xml") == 0
|
||||
|| strncmp(part_name, "word/footer", sizeof("word/footer") - 1) == 0
|
||||
|| strncmp(part_name, "word/header", sizeof("word/header") - 1) == 0
|
||||
// PowerPoint
|
||||
|| strncmp(part_name, "ppt/slides/slide", sizeof("ppt/slides/slide") - 1) == 0
|
||||
|| strncmp(part_name, "ppt/notesSlides/notesSlide", sizeof("ppt/notesSlides/notesSlide") - 1) == 0
|
||||
// Excel
|
||||
|| strncmp(part_name, "xl/worksheets/sheet", sizeof("xl/worksheets/sheet") - 1) == 0
|
||||
|| strcmp(part_name, "xl/sharedStrings.xml") == 0
|
||||
|| strcmp(part_name, "xl/workbook.xml") == 0
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/**
 * Extract the text of a single package part into buf.
 *
 * The part is opened as an XML stream (encoding "UTF-8"); if that fails the
 * function returns without touching buf. Otherwise the element found at
 * document level is handed to dumpText(), and the reader is cleaned up.
 *
 * @param c    container the part belongs to
 * @param buf  buffer receiving the extracted text
 * @param part part to read
 */
void read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part) {

    mceTextReader_t xml;

    if (opcXmlReaderOpen(c, &xml, part, NULL, "UTF-8", 0) != OPC_ERROR_NONE) {
        //todo verbose
        return;
    }

    mce_start_document(&xml) {
        mce_start_element(&xml, NULL, NULL) {
            dumpText(&xml, buf);
        } mce_end_element(&xml);
    } mce_end_document(&xml);

    mceTextReaderCleanup(&xml);
}
|
||||
|
||||
/**
 * Extract the text content of an in-memory OOXML package and attach it to
 * doc as a MetaContent meta line.
 *
 * Every part accepted by should_read_part() is read with read_part(); the
 * collected text is NUL-terminated and copied into a newly allocated
 * meta_line_t that is appended to doc with APPEND_META.
 *
 * Nothing is appended when mem is NULL, when the container cannot be opened,
 * or when the meta line cannot be allocated.
 *
 * @param mem     start of the package bytes (may be NULL)
 * @param mem_len number of bytes available at mem
 * @param doc     document receiving the content meta line
 */
void parse_doc(void *mem, size_t mem_len, document_t *doc) {

    if (mem == NULL) {
        return;
    }

    opcContainer *c = opcContainerOpenMem(mem, mem_len, OPC_OPEN_READ_ONLY, NULL);
    if (c == NULL) {
        //todo verbose
        return;
    }

    dyn_buffer_t buf = dyn_buffer_create();

    // Test the part before using it: a container without any part must not
    // reach should_read_part() with an invalid handle.
    for (opcPart part = opcPartGetFirst(c); part != NULL; part = opcPartGetNext(c, part)) {
        if (should_read_part(part)) {
            read_part(c, &buf, part);
        }
    }

    opcContainerClose(c, OPC_CLOSE_NOW);
    dyn_buffer_write_char(&buf, '\0');

    // buf.cur includes the terminator written just above
    meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.cur);
    if (meta == NULL) {
        dyn_buffer_destroy(&buf);
        return;
    }

    meta->key = MetaContent;
    strcpy(meta->strval, buf.buf);
    APPEND_META(doc, meta)

    dyn_buffer_destroy(&buf);
}
|
||||
8
src/parsing/doc.h
Normal file
8
src/parsing/doc.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef SIST2_DOC_H
|
||||
#define SIST2_DOC_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
void parse_doc(void *buf, size_t buf_len, document_t *doc);
|
||||
|
||||
#endif
|
||||
@@ -22,6 +22,9 @@
|
||||
/* Flag bit tested by IS_ARC_FILTER() on a mime id. */
#define ARC_FILTER_MASK 0x08000000
/* Non-zero when mime_id has the ARC_FILTER_MASK bit set. Fully parenthesized
 * so it composes safely with !, &&, | and friends. */
#define IS_ARC_FILTER(mime_id) (((mime_id) & ARC_FILTER_MASK) == ARC_FILTER_MASK)

/* Flag bit tested by IS_DOC() on a mime id. */
#define DOC_MASK 0x04000000
/* Non-zero when mime_id has the DOC_MASK bit set. Fully parenthesized so it
 * composes safely with !, &&, | and friends. */
#define IS_DOC(mime_id) (((mime_id) & DOC_MASK) == DOC_MASK)
|
||||
|
||||
enum major_mime {
|
||||
MimeInvalid = 0,
|
||||
MimeModel = 1,
|
||||
|
||||
@@ -72,9 +72,9 @@ enum mime {
|
||||
application_vnd_oasis_opendocument_presentation=655424,
|
||||
application_vnd_oasis_opendocument_spreadsheet=655425,
|
||||
application_vnd_oasis_opendocument_text=655426,
|
||||
application_vnd_openxmlformats_officedocument_presentationml_presentation=655427,
|
||||
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655428,
|
||||
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655429,
|
||||
application_vnd_openxmlformats_officedocument_presentationml_presentation=655427 | 0x04000000,
|
||||
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655428 | 0x04000000,
|
||||
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655429 | 0x04000000,
|
||||
application_vnd_symbian_install=655430,
|
||||
application_vnd_tcpdump_pcap=655431,
|
||||
application_vnd_wap_wmlc=655432,
|
||||
|
||||
@@ -137,6 +137,13 @@ void parse(void *arg) {
|
||||
(IS_ARC_FILTER(doc.mime) && should_parse_filtered_file(doc.filepath, doc.ext))
|
||||
)) {
|
||||
parse_archive(&job->vfile, &doc);
|
||||
} else if (ScanCtx.content_size > 0 && IS_DOC(doc.mime)) {
|
||||
void *doc_buf = read_all(job, (char *) buf, bytes_read);
|
||||
parse_doc(doc_buf, doc.size, &doc);
|
||||
|
||||
if (doc_buf != buf && doc_buf != NULL) {
|
||||
free(doc_buf);
|
||||
}
|
||||
}
|
||||
|
||||
//Parent meta
|
||||
|
||||
@@ -30,6 +30,8 @@
|
||||
#include "freetype/freetype.h"
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include <opc/opc.h>
|
||||
#include <libxml/xmlstring.h>
|
||||
|
||||
#ifndef SIST_SCAN_ONLY
|
||||
#include <onion/onion.h>
|
||||
@@ -55,6 +57,7 @@
|
||||
#include "parsing/media.h"
|
||||
#include "parsing/font.h"
|
||||
#include "parsing/arc.h"
|
||||
#include "parsing/doc.h"
|
||||
#include "cli.h"
|
||||
#include "utf8.h/utf8.h"
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user