Rewrite doc.c module, fix bad error handling, fix pdf.c memory leaks

2025-12-11 14:38:54 +00:00 · 2020-03-05 16:12:34 -05:00
parent 9ace5774af
commit 1abddabeec
26 changed files with 110 additions and 2873 deletions
--- a/src/main.c
+++ b/src/main.c
@@ -6,7 +6,7 @@
 #define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"


-static const char *const Version = "1.3.0";
+static const char *const Version = "1.3.1";
 static const char *const usage[] = {
        "sist2 scan [OPTION]... PATH",
        "sist2 index [OPTION]... INDEX",
@@ -17,7 +17,6 @@ static const char *const usage[] = {
 void global_init() {
    curl_global_init(CURL_GLOBAL_NOTHING);
    av_log_set_level(AV_LOG_QUIET);
-    opcInitLibrary();
 }

 void init_dir(const char *dirpath) {
--- a/src/parsing/doc.c
+++ b/src/parsing/doc.c
@@ -1,27 +1,39 @@
 #include "doc.h"
 #include "src/ctx.h"

+
+// Non-zero when string x begins with y; y must be a string literal, its length is taken with sizeof
+#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
+
+/**
+ * Tell whether an archive member should be read for its text.
+ *
+ * @param part Path of the member inside the archive, may be NULL
+ * @return TRUE if part starts with one of the Word, PowerPoint or Excel prefixes below, FALSE otherwise
+ */
 __always_inline
-static int should_read_part(char *part) {
+static int should_read_part(const char *part) {

-    LOG_DEBUGF("doc.c", "Got part : %s", part)
-    char *part_name = (char *) part;
-
    if (part == NULL) {
        return FALSE;
    }

+    // Logged after the NULL check so that "%s" never receives a NULL pointer
+    LOG_DEBUGF("doc.c", "Got part : %s", part)
+
    if (    // Word
-            strcmp(part_name, "word/document.xml") == 0
-            || strncmp(part_name, "word/footer", sizeof("word/footer") - 1) == 0
-            || strncmp(part_name, "word/header", sizeof("word/header") - 1) == 0
+            STR_STARTS_WITH(part, "word/document.xml")
+            || STR_STARTS_WITH(part, "word/footnotes.xml")
+            || STR_STARTS_WITH(part, "word/endnotes.xml")
+            || STR_STARTS_WITH(part, "word/footer")
+            || STR_STARTS_WITH(part, "word/header")
            // PowerPoint
-            || strncmp(part_name, "ppt/slides/slide", sizeof("ppt/slides/slide") - 1) == 0
-            || strncmp(part_name, "ppt/notesSlides/notesSlide", sizeof("ppt/notesSlides/notesSlide") - 1) == 0
+            || STR_STARTS_WITH(part, "ppt/slides/slide")
+            || STR_STARTS_WITH(part, "ppt/notesSlides/notesSlide")
            // Excel
-            || strncmp(part_name, "xl/worksheets/sheet", sizeof("xl/worksheets/sheet") - 1) == 0
-            || strcmp(part_name, "xl/sharedStrings.xml") == 0
-            || strcmp(part_name, "xl/workbook.xml") == 0
+            || STR_STARTS_WITH(part, "xl/worksheets/sheet")
+            || STR_STARTS_WITH(part, "xl/sharedStrings.xml")
+            || STR_STARTS_WITH(part, "xl/workbook.xml")
            ) {
        return TRUE;
    }
@@ -29,78 +33,115 @@ static int should_read_part(char *part) {
    return FALSE;
 }

-typedef int (XMLCALL *xmlInputReadCallback)(void *context, char *buffer, int len);
-
-typedef struct {
-    struct archive *a;
-} xml_io_ctx;
+/**
+ * Append the text of every "t" element found in a sibling list and below it.
+ *
+ * Visits node, each of its following siblings and all of their descendants;
+ * every piece of text is followed by a single space.
+ *
+ * @param xml  Document the nodes belong to
+ * @param node First node of the sibling list, may be NULL
+ * @param buf  Buffer the text is appended to
+ */
+static void append_text_nodes(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
+    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
+    for (xmlNode *child = node; child; child = child->next) {
+        // xmlStrcmp() tolerates a NULL name, unlike dereferencing child->name directly
+        if (xmlStrcmp(child->name, (const xmlChar *) "t") == 0) {
+            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
+
+            if (text) {
+                text_buffer_append_string0(buf, (char *) text);
+                text_buffer_append_char(buf, ' ');
+                xmlFree(text);
+            }
+        }
+
+        append_text_nodes(xml, child->children, buf);
+    }
+}
+
+/**
+ * Append the text of a parsed XML part to buf.
+ *
+ * libxml2's last recorded error is checked once, before the tree is walked:
+ * a fatal error aborts the extraction, any other error is only logged.
+ *
+ * @param xml  Parsed document
+ * @param node Node to start from; its following siblings are visited too
+ * @param buf  Buffer the text is appended to
+ * @return 0 on success, -1 if the last XML error is fatal (nothing is appended)
+ */
+int extract_text(xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
+    xmlErrorPtr err = xmlGetLastError();
+    if (err != NULL) {
+        if (err->level == XML_ERR_FATAL) {
+            LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
+            return -1;
+        } else {
+            LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
+        }
+    }
+
+    append_text_nodes(xml, node, buf);
+    return 0;
+}

+/**
+ * Read callback for xmlReadIO(): pulls data from the current archive entry.
+ *
+ * @param context The struct archive to read from
+ * @param buffer  Destination buffer
+ * @param len     Capacity of buffer in bytes
+ * @return Number of bytes read, 0 at the end of the entry, -1 on error
+ */
 int xml_io_read(void *context, char *buffer, int len) {
-    xml_io_ctx *ctx = context;
-
-    //TODO: return value ?
-    return archive_read_data(ctx->a, buffer, len);
+    struct archive *a = context;
+
+    // archive_read_data() reports errors as negative ARCHIVE_* codes while libxml2
+    // documents -1 for a failed read; len is an int, so a byte count always fits
+    int bytes_read = (int) archive_read_data(a, buffer, len);
+    return bytes_read < 0 ? -1 : bytes_read;
 }

-int xml_io_close(void *context) {
+// Close callback for xmlReadIO(); nothing to release here, parse_doc() closes the archive
+int xml_io_close(UNUSED(void *context)) {
    //noop
    return 0;
 }

+/**
+ * Parse the current archive entry as XML and append its text to buf.
+ *
+ * @param a   Archive positioned on the entry to read
+ * @param buf Buffer the text is appended to
+ * @param doc Document being indexed; its filepath tags the log messages
+ * @return -1 if the entry could not be parsed or has no root element, 0 otherwise
+ */
 __always_inline
-static int read_part(struct archive *a, dyn_buffer_t *buf, document_t *doc) {
+static int read_part(struct archive *a, text_buffer_t *buf, document_t *doc) {

-    xmlNode *root, *first_child, *node1, *node2, *node3, *node4;
+    // Clear libxml2's last-error record so extract_text() only reacts to errors raised by this entry
+    xmlResetLastError();
+    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);

-    xml_io_ctx ctx = {a};
-
-    /* do actual parsing of document */
-    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, &ctx, "/", NULL, 0);
-
-    /* error checking! */
    if (xml == NULL) {
-        fprintf(stderr, "Document not parsed successfully. \n");
+        LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }
-    root = xmlDocGetRootElement(xml);
+
+    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
-        fprintf(stderr, "empty document\n");
-        xmlFreeDoc(xml);
-        return -1;
-    }
-    if (xmlStrcmp(root->name, (const xmlChar *) "document") != 0) {
-        fprintf(stderr, "document of the wrong type, root node != document");
+        LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

-    /* init a few more variables */
-    xmlChar *key;
+    // extract_text() logs a fatal XML error itself; carry on so the remaining parts are still read
+    (void) extract_text(xml, root, buf);
+    xmlFreeDoc(xml);

-    first_child = root->children;
-    for (node1 = first_child; node1; node1 = node1->next) {
-        if ((xmlStrcmp(node1->name, (const xmlChar *) "body")) == 0) {
-            for (node2 = node1->children; node2; node2 = node2->next) {
-                if ((xmlStrcmp(node2->name, (const xmlChar *) "p")) == 0) {
-
-                    dyn_buffer_write_char(buf, ' ');
-
-                    for (node3 = node2->children; node3; node3 = node3->next) {
-                        if ((xmlStrcmp(node3->name, (const xmlChar *) "r")) == 0) {
-                            for (node4 = node3->children; node4; node4 = node4->next) {
-                                if ((!xmlStrcmp(node4->name, (const xmlChar *) "t"))) {
-                                    key = xmlNodeListGetString(xml, node4->xmlChildrenNode, 1);
-
-                                    dyn_buffer_append_string(buf, (char *) key);
-                                    dyn_buffer_write_char(buf, ' ');
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
+    return 0;
 }

 void parse_doc(void *mem, size_t mem_len, document_t *doc) {
@@ -114,17 +104,17 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {

    int ret = archive_read_open_memory(a, mem, mem_len);
    if (ret != ARCHIVE_OK) {
-        LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a));
+        LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
        archive_read_free(a);
        return;
    }

-    dyn_buffer_t buf = dyn_buffer_create();
+    text_buffer_t buf = text_buffer_create(ScanCtx.content_size);

    struct archive_entry *entry;
    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
-            char *path = (char *) archive_entry_pathname(entry);
+            const char *path = archive_entry_pathname(entry);

            if (should_read_part(path)) {
                ret = read_part(a, &buf, doc);
@@ -132,21 +122,19 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {
                    break;
                }
            }
-
        }
    }

+    if (buf.dyn_buffer.cur > 0) {
+        text_buffer_terminate_string(&buf);

-    // close
-
-    if (buf.cur > 0) {
-        dyn_buffer_write_char(&buf, '\0');
-
-        meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.cur);
+        meta_line_t *meta = malloc(sizeof(meta_line_t) + buf.dyn_buffer.cur);
        meta->key = MetaContent;
-        strcpy(meta->strval, buf.buf);
+        strcpy(meta->strval, buf.dyn_buffer.buf);
        APPEND_META(doc, meta)
    }

-    dyn_buffer_destroy(&buf);
+    archive_read_close(a);
+    archive_read_free(a);
+    text_buffer_destroy(&buf);
 }
--- a/src/parsing/parse.c
+++ b/src/parsing/parse.c
@@ -36,7 +36,7 @@ void *read_all(parse_job_t *job, const char *buf, int bytes_read) {
        memcpy(full_buf, buf, bytes_read);

        int ret = job->vfile.read(&job->vfile, full_buf + bytes_read, job->info.st_size - bytes_read);
-        if (ret == -1) {
+        if (ret < 0) {
            LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno))
            return NULL;
        }
@@ -58,6 +58,7 @@ void parse(void *arg) {

    if (Magic == NULL) {
        Magic = magic_open(MAGIC_MIME_TYPE);
+        magic_load(Magic, NULL);
    }

    doc.filepath = job->filepath;
@@ -90,7 +91,7 @@ void parse(void *arg) {
    if (doc.mime == 0 && !ScanCtx.fast) {
        // Get mime type with libmagic
        bytes_read = job->vfile.read(&job->vfile, buf, PARSE_BUF_SIZE);
-        if (bytes_read == -1) {
+        if (bytes_read < 0) {
            LOG_WARNINGF(job->filepath, "read() Error: %s", strerror(errno))
            CLOSE_FILE(job->vfile)
            return;
@@ -99,10 +100,16 @@ void parse(void *arg) {
        const char *magic_mime_str = magic_buffer(Magic, buf, bytes_read);
        if (magic_mime_str != NULL) {
            doc.mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
+
+            LOG_DEBUGF(job->filepath, "libmagic: %s", magic_mime_str);
+
            if (doc.mime == 0) {
                LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
            }
        }
+
+        magic_close(Magic);
+        Magic = NULL;
    }

    int mmime = MAJOR_MIME(doc.mime);
@@ -112,11 +119,11 @@ void parse(void *arg) {
    } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) ||
               (mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {

-        if (job->vfile.is_fs_file) {
-            parse_media_filename(job->filepath, &doc);
-        } else {
-            parse_media_vfile(&job->vfile, &doc);
-        }
+//        if (job->vfile.is_fs_file) {
+//            parse_media_filename(job->filepath, &doc);
+//        } else {
+//            parse_media_vfile(&job->vfile, &doc);
+//        }

    } else if (IS_PDF(doc.mime)) {
        void *pdf_buf = read_all(job, (char *) buf, bytes_read);
--- a/src/sist.h
+++ b/src/sist.h
@@ -31,8 +31,8 @@
 #include "freetype/freetype.h"
 #include <archive.h>
 #include <archive_entry.h>
-#include <opc/opc.h>
 #include <libxml/xmlstring.h>
+#include <libxml/parser.h>
 #define BOOL int
 #include <tesseract/capi.h>
 #include <pcre.h>