From 23da8ada5ff50d8714ce92db7220bd071b3ea10a Mon Sep 17 00:00:00 2001
From: simon987 <me@simon987.net>
Date: Mon, 6 Sep 2021 21:25:05 -0400
Subject: [PATCH] Add basic JSON/NDJSON support

---
 CMakeLists.txt      |   3 ++
 libscan/json/json.c | 119 ++++++++++++++++++++++++++++++++++++++++++++
 libscan/json/json.h |  30 +++++++++++
 libscan/scan.h      |   1 +
 test/main.cpp       |  27 ++++++++++
 5 files changed, 180 insertions(+)
 create mode 100644 libscan/json/json.c
 create mode 100644 libscan/json/json.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52f05b3..430c8a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(
         libscan/media/media.c libscan/media/media.h
         libscan/font/font.c libscan/font/font.h
         libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
+        libscan/json/json.c libscan/json/json.h
         libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
 
         third-party/utf8.h
@@ -33,6 +34,7 @@ set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
 
 set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
 
+find_package(cJSON CONFIG REQUIRED)
 find_package(LibArchive REQUIRED)
 find_package(BZip2 REQUIRED)
 find_package(lz4 REQUIRED)
@@ -149,6 +151,7 @@ target_link_libraries(
         scan
         PUBLIC
 
+        cjson
         ${LibArchive_LIBRARIES}
         ZLIB::ZLIB
         BZip2::BZip2
diff --git a/libscan/json/json.c b/libscan/json/json.c
new file mode 100644
index 0000000..a12822b
--- /dev/null
+++ b/libscan/json/json.c
@@ -0,0 +1,119 @@
+#include "json.h"
+#include "cjson/cJSON.h"
+
+
+#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
+
+/* Appends every string value found under `json` to `tex`, each followed by a
+ * space. Returns TRUE as soon as an append reports TEXT_BUF_FULL, else FALSE. */
+int json_extract_text(cJSON *json, text_buffer_t *tex) {
+    if (cJSON_IsString(json)) {
+        /* Leaf: copy the value first, then the separator. */
+        if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
+            return TRUE;
+        }
+        return text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL ? TRUE : FALSE;
+    }
+
+    if (!cJSON_IsObject(json) && !cJSON_IsArray(json)) {
+        return FALSE;
+    }
+
+    /* Containers: walk the members in order and stop once the buffer is full. */
+    cJSON *member = json->child;
+    while (member != NULL) {
+        if (json_extract_text(member, tex)) {
+            return TRUE;
+        }
+        member = member->next;
+    }
+    return FALSE;
+}
+
+/* Reads the whole file, parses it as one JSON document and appends the text of
+ * its string values to doc as MetaContent. Files over JSON_MAX_FILE_SIZE are skipped. */
+scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
+    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
+        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
+        return SCAN_ERR_SKIP;
+    }
+    size_t buf_len;
+    char *buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        return SCAN_ERR_READ;
+    }
+
+    /* cJSON expects a NUL-terminated string, so grow the buffer by one byte.
+     * realloc() leaves the old block allocated on failure: release it here. */
+    char *terminated = realloc(buf, buf_len + 1);
+    if (terminated == NULL) {
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+    terminated[buf_len] = '\0';
+
+    cJSON *json = cJSON_ParseWithOpts(terminated, NULL, TRUE);
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+    json_extract_text(json, &tex);
+    text_buffer_terminate_string(&tex);
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
+    cJSON_Delete(json);
+    free(terminated);
+    text_buffer_destroy(&tex);
+    return SCAN_OK;
+}
+
+#define JSON_BUF_SIZE (1024 * 1024 * 5)
+
+/* Streams newline-delimited JSON through a JSON_BUF_SIZE window, parsing one
+ * value per iteration and appending its string values to doc as MetaContent. */
+scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
+    char *buf = malloc(JSON_BUF_SIZE + 1);
+    if (buf == NULL) {
+        return SCAN_ERR_READ;
+    }
+    /* NOTE(review): -1 kept from the original; presumably "no size cap" - confirm. */
+    text_buffer_t tex = text_buffer_create(-1);
+    size_t len = 0; /* number of valid bytes currently held in buf */
+    int eof = FALSE;
+
+    while (TRUE) {
+        if (!eof) {
+            /* Top the window back up to JSON_BUF_SIZE bytes. */
+            size_t room = JSON_BUF_SIZE - len;
+            size_t got = f->read(f, buf + len, room);
+            eof = got != room;
+            if (got <= room) { /* anything larger is assumed to be an error code */
+                len += got;
+            }
+        }
+        /* Terminate after the valid bytes so cJSON never reads stale/uninitialized data. */
+        buf[len] = '\0';
+
+        const char *parse_end = buf;
+        cJSON *json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);
+        if (json == NULL && parse_end == buf + JSON_BUF_SIZE) {
+            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
+            break;
+        }
+        if (parse_end == buf) {
+            /* Nothing was consumed: end of input or unparsable data. */
+            cJSON_Delete(json);
+            break;
+        }
+        int full = json_extract_text(json, &tex);
+        cJSON_Delete(json);
+        if (full) {
+            break;
+        }
+
+        /* Discard the consumed prefix; the next read refills the freed tail. */
+        len -= (size_t) (parse_end - buf);
+        memmove(buf, parse_end, len);
+    }
+    text_buffer_terminate_string(&tex);
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
+    free(buf);
+    text_buffer_destroy(&tex);
+    return SCAN_OK;
+}
diff --git a/libscan/json/json.h b/libscan/json/json.h
new file mode 100644
index 0000000..3d328f3
--- /dev/null
+++ b/libscan/json/json.h
@@ -0,0 +1,30 @@
+#ifndef SCAN_JSON_H
+#define SCAN_JSON_H
+
+#include "../scan.h"
+
+
+typedef struct { /* Settings and callbacks handed to parse_json() / parse_ndjson() */
+    long content_size; /* passed to text_buffer_create() by parse_json() */
+    log_callback_t log; /* NOTE(review): presumably used by the CTX_LOG_* macros - confirm */
+    logf_callback_t logf; /* NOTE(review): presumably used by the CTX_LOG_*F macros - confirm */
+    store_callback_t store;
+    unsigned int json_mime; /* mime id that is_json() compares against */
+    unsigned int ndjson_mime; /* mime id that is_ndjson() compares against */
+} scan_json_ctx_t;
+
+scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
+
+scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
+
+__always_inline
+static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
+    return ctx->json_mime == mime; /* non-zero iff mime is the context's JSON mime id */
+}
+
+__always_inline
+static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
+    return ctx->ndjson_mime == mime; /* non-zero iff mime is the context's NDJSON mime id */
+}
+
+#endif
diff --git a/libscan/scan.h b/libscan/scan.h
index fb40a47..e06e8c3 100644
--- a/libscan/scan.h
+++ b/libscan/scan.h
@@ -22,6 +22,7 @@ typedef void (*log_callback_t)(const char *filepath, int level, char *str);
 typedef int scan_code_t;
 #define SCAN_OK (scan_code_t) 0
 #define SCAN_ERR_READ (scan_code_t) (-1)
+#define SCAN_ERR_SKIP (scan_code_t) (-2)
 
 #define LEVEL_DEBUG 0
 #define LEVEL_INFO 1
diff --git a/test/main.cpp b/test/main.cpp
index 19d59b7..42f2818 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -12,6 +12,7 @@ extern "C" {
 #include "../libscan/raw/raw.h"
 #include "../libscan/msdoc/msdoc.h"
 #include "../libscan/wpd/wpd.h"
+#include "../libscan/json/json.h"
 #include <libavutil/avutil.h>
 }
 
@@ -42,6 +43,8 @@ static scan_msdoc_ctx_t msdoc_text_ctx;
 
 static scan_wpd_ctx_t wpd_ctx;
 
+static scan_json_ctx_t json_ctx;
+
 
 document_t LastSubDoc;
 
@@ -969,6 +972,26 @@ TEST(Wpd, Wpd51_1) {
     cleanup(&doc, &f);
 }
 
+TEST(Json, Json1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/json/json1.json", &f, &doc);
+    // The fixture must parse; EXPECT (not ASSERT) so cleanup() still runs on failure.
+    EXPECT_EQ(parse_json(&json_ctx, &f, &doc), SCAN_OK);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Json, NDJson1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &f, &doc);
+    // Smoke test: the result is not inspected; this only exercises parse_ndjson() on the fixture.
+    parse_ndjson(&json_ctx, &f, &doc);
+
+    cleanup(&doc, &f);
+}
+
 int main(int argc, char **argv) {
     setlocale(LC_ALL, "");
 
@@ -1053,6 +1076,10 @@ int main(int argc, char **argv) {
     wpd_ctx.logf = noop_logf;
     wpd_ctx.content_size = 500;
 
+    json_ctx.log = noop_log;
+    json_ctx.logf = noop_logf;
+    json_ctx.content_size = 5000;
+
     av_log_set_level(AV_LOG_QUIET);
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();