mirror of
https://github.com/simon987/sist2.git
synced 2025-12-11 14:38:54 +00:00
Rewrite doc.c module, fix bad error handling, fix pdf.c memory leaks
This commit is contained in:
@@ -6,7 +6,7 @@
|
||||
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
|
||||
|
||||
|
||||
static const char *const Version = "1.3.0";
|
||||
static const char *const Version = "1.3.1";
|
||||
static const char *const usage[] = {
|
||||
"sist2 scan [OPTION]... PATH",
|
||||
"sist2 index [OPTION]... INDEX",
|
||||
@@ -17,7 +17,6 @@ static const char *const usage[] = {
|
||||
void global_init() {
|
||||
curl_global_init(CURL_GLOBAL_NOTHING);
|
||||
av_log_set_level(AV_LOG_QUIET);
|
||||
opcInitLibrary();
|
||||
}
|
||||
|
||||
void init_dir(const char *dirpath) {
|
||||
|
||||
@@ -1,27 +1,31 @@
|
||||
#include "doc.h"
|
||||
#include "src/ctx.h"
|
||||
|
||||
|
||||
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
||||
|
||||
__always_inline
|
||||
static int should_read_part(char *part) {
|
||||
static int should_read_part(const char *part) {
|
||||
|
||||
LOG_DEBUGF("doc.c", "Got part : %s", part)
|
||||
char *part_name = (char *) part;
|
||||
|
||||
if (part == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if ( // Word
|
||||
strcmp(part_name, "word/document.xml") == 0
|
||||
|| strncmp(part_name, "word/footer", sizeof("word/footer") - 1) == 0
|
||||
|| strncmp(part_name, "word/header", sizeof("word/header") - 1) == 0
|
||||
STR_STARTS_WITH(part, "word/document.xml")
|
||||
|| STR_STARTS_WITH(part, "word/footnotes.xml")
|
||||
|| STR_STARTS_WITH(part, "word/endnotes.xml")
|
||||
|| STR_STARTS_WITH(part, "word/footer")
|
||||
|| STR_STARTS_WITH(part, "word/header")
|
||||
// PowerPoint
|
||||
|| strncmp(part_name, "ppt/slides/slide", sizeof("ppt/slides/slide") - 1) == 0
|
||||
|| strncmp(part_name, "ppt/notesSlides/notesSlide", sizeof("ppt/notesSlides/notesSlide") - 1) == 0
|
||||
|| STR_STARTS_WITH(part, "ppt/slides/slide")
|
||||
|| STR_STARTS_WITH(part, "ppt/notesSlides/slide")
|
||||
// Excel
|
||||
|| strncmp(part_name, "xl/worksheets/sheet", sizeof("xl/worksheets/sheet") - 1) == 0
|
||||
|| strcmp(part_name, "xl/sharedStrings.xml") == 0
|
||||
|| strcmp(part_name, "xl/workbook.xml") == 0
|
||||
|| STR_STARTS_WITH(part, "xl/worksheets/sheet")
|
||||
|| STR_STARTS_WITH(part, "xl/sharedStrings.xml")
|
||||
|| STR_STARTS_WITH(part, "xl/workbook.xml")
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
@@ -29,78 +33,64 @@ static int should_read_part(char *part) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
typedef int (XMLCALL *xmlInputReadCallback)(void *context, char *buffer, int len);
|
||||
int extract_text(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
|
||||
//TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
|
||||
xmlErrorPtr err = xmlGetLastError();
|
||||
if (err != NULL) {
|
||||
if (err->level == XML_ERR_FATAL) {
|
||||
LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
|
||||
return -1;
|
||||
} else {
|
||||
LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
struct archive *a;
|
||||
} xml_io_ctx;
|
||||
for (xmlNode *child = node; child; child = child->next) {
|
||||
if (*child->name == 't' && *(child->name + 1) == '\0') {
|
||||
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
|
||||
|
||||
int xml_io_read(void *context, char *buffer, int len) {
|
||||
xml_io_ctx *ctx = context;
|
||||
if (text) {
|
||||
text_buffer_append_string0(buf, (char *) text);
|
||||
text_buffer_append_char(buf, ' ');
|
||||
xmlFree(text);
|
||||
}
|
||||
}
|
||||
|
||||
//TODO: return value ?
|
||||
return archive_read_data(ctx->a, buffer, len);
|
||||
extract_text(xml, child->children, buf);
|
||||
}
|
||||
}
|
||||
|
||||
int xml_io_close(void *context) {
|
||||
int xml_io_read(void *context, char *buffer, int len) {
|
||||
struct archive *a = context;
|
||||
return archive_read_data(a, buffer, len);
|
||||
}
|
||||
|
||||
int xml_io_close(UNUSED(void *context)) {
|
||||
//noop
|
||||
return 0;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int read_part(struct archive *a, dyn_buffer_t *buf, document_t *doc) {
|
||||
static int read_part(struct archive *a, text_buffer_t *buf, document_t *doc) {
|
||||
|
||||
xmlNode *root, *first_child, *node1, *node2, *node3, *node4;
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
|
||||
xml_io_ctx ctx = {a};
|
||||
|
||||
/* do actual parsing of document */
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, &ctx, "/", NULL, 0);
|
||||
|
||||
/* error checking! */
|
||||
if (xml == NULL) {
|
||||
fprintf(stderr, "Document not parsed successfully. \n");
|
||||
LOG_ERROR(doc->filepath, "Could not parse XML")
|
||||
return -1;
|
||||
}
|
||||
root = xmlDocGetRootElement(xml);
|
||||
|
||||
xmlNode *root = xmlDocGetRootElement(xml);
|
||||
if (root == NULL) {
|
||||
fprintf(stderr, "empty document\n");
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) {
|
||||
fprintf(stderr, "document of the wrong type, root node != document");
|
||||
LOG_ERROR(doc->filepath, "Empty document")
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* init a few more variables */
|
||||
xmlChar *key;
|
||||
extract_text(xml, root, buf);
|
||||
xmlFreeDoc(xml);
|
||||
|
||||
first_child = root->children;
|
||||
for (node1 = first_child; node1; node1 = node1->next) {
|
||||
if ((xmlStrcmp(node1->name, (const xmlChar *) "body")) == 0) {
|
||||
for (node2 = node1->children; node2; node2 = node2->next) {
|
||||
if ((xmlStrcmp(node2->name, (const xmlChar *) "p")) == 0) {
|
||||
|
||||
dyn_buffer_write_char(buf, ' ');
|
||||
|
||||
for (node3 = node2->children; node3; node3 = node3->next) {
|
||||
if ((xmlStrcmp(node3->name, (const xmlChar *) "r")) == 0) {
|
||||
for (node4 = node3->children; node4; node4 = node4->next) {
|
||||
if ((!xmlStrcmp(node4->name, (const xmlChar *) "t"))) {
|
||||
key = xmlNodeListGetString(xml, node4->xmlChildrenNode, 1);
|
||||
|
||||
dyn_buffer_append_string(buf, (char *) key);
|
||||
dyn_buffer_write_char(buf, ' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||
@@ -114,17 +104,17 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||
|
||||
int ret = archive_read_open_memory(a, mem, mem_len);
|
||||
if (ret != ARCHIVE_OK) {
|
||||
LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a));
|
||||
LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
|
||||
archive_read_free(a);
|
||||
return;
|
||||
}
|
||||
|
||||
dyn_buffer_t buf = dyn_buffer_create();
|
||||
text_buffer_t buf = text_buffer_create(ScanCtx.content_size);
|
||||
|
||||
struct archive_entry *entry;
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||
char *path = (char *) archive_entry_pathname(entry);
|
||||
const char *path = archive_entry_pathname(entry);
|
||||
|
||||
if (should_read_part(path)) {
|
||||
ret = read_part(a, &buf, doc);
|
||||
@@ -132,21 +122,19 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (buf.dyn_buffer.cur > 0) {
|
||||
text_buffer_terminate_string(&buf);
|
||||
|
||||
// close
|
||||
|
||||
if (buf.cur > 0) {
|
||||
dyn_buffer_write_char(&buf, '\0');
|
||||
|
||||
meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.cur);
|
||||
meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.dyn_buffer.cur);
|
||||
meta->key = MetaContent;
|
||||
strcpy(meta->strval, buf.buf);
|
||||
strcpy(meta->strval, buf.dyn_buffer.buf);
|
||||
APPEND_META(doc, meta)
|
||||
}
|
||||
|
||||
dyn_buffer_destroy(&buf);
|
||||
archive_read_close(a);
|
||||
archive_read_free(a);
|
||||
text_buffer_destroy(&buf);
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ void *read_all(parse_job_t *job, const char *buf, int bytes_read) {
|
||||
memcpy(full_buf, buf, bytes_read);
|
||||
|
||||
int ret = job->vfile.read(&job->vfile, full_buf + bytes_read, job->info.st_size - bytes_read);
|
||||
if (ret == -1) {
|
||||
if (ret < 0) {
|
||||
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno))
|
||||
return NULL;
|
||||
}
|
||||
@@ -58,6 +58,7 @@ void parse(void *arg) {
|
||||
|
||||
if (Magic == NULL) {
|
||||
Magic = magic_open(MAGIC_MIME_TYPE);
|
||||
magic_load(Magic, NULL);
|
||||
}
|
||||
|
||||
doc.filepath = job->filepath;
|
||||
@@ -90,7 +91,7 @@ void parse(void *arg) {
|
||||
if (doc.mime == 0 && !ScanCtx.fast) {
|
||||
// Get mime type with libmagic
|
||||
bytes_read = job->vfile.read(&job->vfile, buf, PARSE_BUF_SIZE);
|
||||
if (bytes_read == -1) {
|
||||
if (bytes_read < 0) {
|
||||
LOG_WARNINGF(job->filepath, "read() Error: %s", strerror(errno))
|
||||
CLOSE_FILE(job->vfile)
|
||||
return;
|
||||
@@ -99,10 +100,16 @@ void parse(void *arg) {
|
||||
const char *magic_mime_str = magic_buffer(Magic, buf, bytes_read);
|
||||
if (magic_mime_str != NULL) {
|
||||
doc.mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
|
||||
|
||||
LOG_DEBUGF(job->filepath, "libmagic: %s", magic_mime_str);
|
||||
|
||||
if (doc.mime == 0) {
|
||||
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
|
||||
}
|
||||
}
|
||||
|
||||
magic_close(Magic);
|
||||
Magic = NULL;
|
||||
}
|
||||
|
||||
int mmime = MAJOR_MIME(doc.mime);
|
||||
@@ -112,11 +119,11 @@ void parse(void *arg) {
|
||||
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) ||
|
||||
(mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
|
||||
|
||||
if (job->vfile.is_fs_file) {
|
||||
parse_media_filename(job->filepath, &doc);
|
||||
} else {
|
||||
parse_media_vfile(&job->vfile, &doc);
|
||||
}
|
||||
// if (job->vfile.is_fs_file) {
|
||||
// parse_media_filename(job->filepath, &doc);
|
||||
// } else {
|
||||
// parse_media_vfile(&job->vfile, &doc);
|
||||
// }
|
||||
|
||||
} else if (IS_PDF(doc.mime)) {
|
||||
void *pdf_buf = read_all(job, (char *) buf, bytes_read);
|
||||
|
||||
@@ -31,8 +31,8 @@
|
||||
#include "freetype/freetype.h"
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include <opc/opc.h>
|
||||
#include <libxml/xmlstring.h>
|
||||
#include <libxml/parser.h>
|
||||
#define BOOL int
|
||||
#include <tesseract/capi.h>
|
||||
#include <pcre.h>
|
||||
|
||||
Reference in New Issue
Block a user