diff --git a/CMakeLists.txt b/CMakeLists.txt
index 52f05b3..430c8a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(
         libscan/media/media.c libscan/media/media.h
         libscan/font/font.c libscan/font/font.h
         libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
+        libscan/json/json.c libscan/json/json.h
         libscan/wpd/wpd.c libscan/wpd/wpd.h
         libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
         third-party/utf8.h
@@ -33,6 +34,7 @@ set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
 
 set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
 
+find_package(cJSON CONFIG REQUIRED)
 find_package(LibArchive REQUIRED)
 find_package(BZip2 REQUIRED)
 find_package(lz4 REQUIRED)
@@ -149,6 +151,7 @@ target_link_libraries(
         scan
         PUBLIC
 
+        cjson
         ${LibArchive_LIBRARIES}
         ZLIB::ZLIB
         BZip2::BZip2
diff --git a/libscan/json/json.c b/libscan/json/json.c
new file mode 100644
index 0000000..a12822b
--- /dev/null
+++ b/libscan/json/json.c
@@ -0,0 +1,164 @@
+#include "json.h"
+#include "cjson/cJSON.h"
+
+
+#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
+
+/**
+ * Recursively append every string value found in `json` to `tex`, each
+ * followed by a single space. Object keys, numbers, booleans and nulls are
+ * not extracted.
+ *
+ * @return TRUE as soon as the text buffer reports TEXT_BUF_FULL, else FALSE.
+ */
+int json_extract_text(cJSON *json, text_buffer_t *tex) {
+    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
+        /* cJSON keeps object members and array items in the same child list */
+        cJSON *child;
+        cJSON_ArrayForEach(child, json) {
+            if (json_extract_text(child, tex)) {
+                return TRUE;
+            }
+        }
+    } else if (cJSON_IsString(json)) {
+        if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
+            return TRUE;
+        }
+        if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+
+/**
+ * Parse a whole JSON document and store its string values as MetaContent.
+ *
+ * Files larger than JSON_MAX_FILE_SIZE are skipped. If the document does not
+ * parse, no text is extracted (cJSON's type checks and cJSON_Delete accept
+ * NULL) and SCAN_OK is still returned.
+ *
+ * @return SCAN_OK, SCAN_ERR_SKIP when the file is too large, or SCAN_ERR_READ
+ *         when the file could not be read into memory.
+ */
+scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
+
+    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
+        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath);
+        return SCAN_ERR_SKIP;
+    }
+
+    size_t buf_len;
+    char *buf = read_all(f, &buf_len);
+
+    if (buf == NULL) {
+        return SCAN_ERR_READ;
+    }
+
+    /* cJSON needs a NUL-terminated string; keep `buf` valid if realloc fails */
+    char *terminated = realloc(buf, buf_len + 1);
+    if (terminated == NULL) {
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+    buf = terminated;
+    buf[buf_len] = '\0';
+
+    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+
+    json_extract_text(json, &tex);
+    text_buffer_terminate_string(&tex);
+
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
+
+    cJSON_Delete(json);
+    free(buf);
+    text_buffer_destroy(&tex);
+
+    return SCAN_OK;
+}
+
+#define JSON_BUF_SIZE (1024 * 1024 * 5)
+
+/**
+ * Parse newline-delimited JSON and store its string values as MetaContent.
+ *
+ * The file is streamed through a JSON_BUF_SIZE window: every complete value in
+ * the window is parsed, the unparsed tail is moved to the front and the window
+ * is topped up from the file. Extraction stops once the text buffer, sized by
+ * ctx->content_size like in parse_json(), is full.
+ *
+ * @return SCAN_OK, or SCAN_ERR_READ when the read buffer cannot be allocated.
+ */
+scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
+
+    char *buf = malloc(JSON_BUF_SIZE + 1);
+    if (buf == NULL) {
+        CTX_LOG_ERRORF("json.c", "Could not allocate read buffer [%s]", doc->filepath);
+        return SCAN_ERR_READ;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+
+    size_t len = 0; /* number of valid bytes at the start of buf */
+    int eof = FALSE;
+    int full = FALSE;
+
+    while (!full) {
+        if (!eof) {
+            size_t to_read = JSON_BUF_SIZE - len;
+            size_t ret = f->read(f, buf + len, to_read);
+            if (ret > to_read) {
+                /* presumably an error code converted to size_t; treat as no data */
+                ret = 0;
+            }
+            if (ret != to_read) {
+                eof = TRUE;
+            }
+            len += ret;
+        }
+        /* Terminate right after the valid data so stale bytes are never parsed */
+        buf[len] = '\0';
+
+        const char *cursor = buf;
+        while (cursor < buf + len) {
+            const char *parse_end = cursor;
+            cJSON *json = cJSON_ParseWithOpts(cursor, &parse_end, FALSE);
+            if (json == NULL) {
+                /* Incomplete (needs more data) or malformed value */
+                break;
+            }
+
+            full = json_extract_text(json, &tex);
+            cJSON_Delete(json);
+
+            if (full || parse_end <= cursor) {
+                break;
+            }
+            cursor = parse_end;
+        }
+
+        size_t consumed = (size_t) (cursor - buf);
+        if (full || consumed == 0) {
+            if (!full && !eof) {
+                /* A full window did not yield a single value */
+                CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
+            }
+            break;
+        }
+
+        /* Keep the unparsed tail and refill behind it on the next pass */
+        len -= consumed;
+        memmove(buf, cursor, len);
+    }
+
+    text_buffer_terminate_string(&tex);
+
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
+
+    free(buf);
+    text_buffer_destroy(&tex);
+
+    return SCAN_OK;
+}
diff --git a/libscan/json/json.h b/libscan/json/json.h
new file mode 100644
index 0000000..3d328f3
--- /dev/null
+++ b/libscan/json/json.h
@@ -0,0 +1,32 @@
+#ifndef SCAN_JSON_H
+#define SCAN_JSON_H
+
+#include "../scan.h"
+
+
+typedef struct {
+    long content_size;
+    log_callback_t log;
+    logf_callback_t logf;
+    store_callback_t store;
+    unsigned int json_mime;
+    unsigned int ndjson_mime;
+} scan_json_ctx_t;
+
+/* Extract the string values of a JSON document into doc's MetaContent. */
+scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
+
+/* Same as parse_json(), for newline-delimited JSON read in fixed-size chunks. */
+scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
+
+__always_inline
+static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
+    return mime == ctx->json_mime;
+}
+
+__always_inline
+static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
+    return mime == ctx->ndjson_mime;
+}
+
+#endif
diff --git a/libscan/scan.h b/libscan/scan.h
index fb40a47..e06e8c3 100644
--- a/libscan/scan.h
+++ b/libscan/scan.h
@@ -22,6 +22,7 @@ typedef void (*log_callback_t)(const char *filepath, int level, char *str);
 typedef int scan_code_t;
 #define SCAN_OK (scan_code_t) 0
 #define SCAN_ERR_READ (scan_code_t) (-1)
+#define SCAN_ERR_SKIP (scan_code_t) (-2)
 
 #define LEVEL_DEBUG 0
 #define LEVEL_INFO 1
diff --git a/test/main.cpp b/test/main.cpp
index 19d59b7..42f2818 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -12,6 +12,7 @@ extern "C" {
 #include "../libscan/raw/raw.h"
 #include "../libscan/msdoc/msdoc.h"
 #include "../libscan/wpd/wpd.h"
+#include "../libscan/json/json.h"
 #include
 }
 
@@ -42,6 +43,8 @@ static scan_msdoc_ctx_t msdoc_text_ctx;
 
 static scan_wpd_ctx_t wpd_ctx;
 
+static scan_json_ctx_t json_ctx;
+
 document_t LastSubDoc;
 
 
@@ -969,6 +972,26 @@ TEST(Wpd, Wpd51_1) {
     cleanup(&doc, &f);
 }
 
+TEST(Json, Json1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/json/json1.json", &f, &doc);
+
+    parse_json(&json_ctx, &f, &doc);
+
+    cleanup(&doc, &f);
+}
+
+TEST(Json, NDJson1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &f, &doc);
+
+    parse_ndjson(&json_ctx, &f, &doc);
+
+    cleanup(&doc, &f);
+}
+
 int main(int argc, char **argv) {
     setlocale(LC_ALL, "");
 
@@ -1053,6 +1076,10 @@ int main(int argc, char **argv) {
     wpd_ctx.logf = noop_logf;
     wpd_ctx.content_size = 500;
 
+    json_ctx.log = noop_log;
+    json_ctx.logf = noop_logf;
+    json_ctx.content_size = 5000;
+
     av_log_set_level(AV_LOG_QUIET);
     ::testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();