mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 04:22:58 +00:00
Add basic JSON/NDJSON support
This commit is contained in:
parent
f061212d4b
commit
23da8ada5f
@ -25,6 +25,7 @@ add_library(
|
||||
libscan/media/media.c libscan/media/media.h
|
||||
libscan/font/font.c libscan/font/font.h
|
||||
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
|
||||
libscan/json/json.c libscan/json/json.h
|
||||
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
|
||||
|
||||
third-party/utf8.h
|
||||
@ -33,6 +34,7 @@ set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
|
||||
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
|
||||
|
||||
find_package(cJSON CONFIG REQUIRED)
|
||||
find_package(LibArchive REQUIRED)
|
||||
find_package(BZip2 REQUIRED)
|
||||
find_package(lz4 REQUIRED)
|
||||
@ -149,6 +151,7 @@ target_link_libraries(
|
||||
scan
|
||||
PUBLIC
|
||||
|
||||
cjson
|
||||
${LibArchive_LIBRARIES}
|
||||
ZLIB::ZLIB
|
||||
BZip2::BZip2
|
||||
|
119
libscan/json/json.c
Normal file
119
libscan/json/json.c
Normal file
@ -0,0 +1,119 @@
|
||||
#include "json.h"
|
||||
#include "cjson/cJSON.h"
|
||||
|
||||
|
||||
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
|
||||
|
||||
/**
 * Recursively collect the text of every string value found in a cJSON tree.
 *
 * Each string value is appended to `tex`, followed by a single space.
 * Objects and arrays are walked through their child list; only values are
 * visited (member names are never appended). Any other node type, including
 * a NULL `json`, contributes nothing.
 *
 * @param json node to walk (may be NULL)
 * @param tex  destination text buffer
 * @return TRUE as soon as an append reports TEXT_BUF_FULL, FALSE otherwise
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsString(json)) {
        /* The separator is only appended when the value itself fitted. */
        int full = text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL
                   || text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
        return full ? TRUE : FALSE;
    }

    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
        /* Objects and arrays both keep their members in the `child` list. */
        cJSON *item = json->child;
        while (item != NULL) {
            if (json_extract_text(item, tex)) {
                return TRUE;
            }
            item = item->next;
        }
    }

    return FALSE;
}
|
||||
|
||||
/**
 * Parse a complete JSON file and store the text of its string values as the
 * document's MetaContent.
 *
 * The whole file is read into memory, NUL-terminated and handed to cJSON.
 * If cJSON rejects the input it returns NULL; json_extract_text() then adds
 * nothing and the stored content is whatever the empty text buffer yields.
 *
 * @param ctx parser context (content_size sizes the text buffer, logf is
 *            used by the CTX_LOG_* macros)
 * @param f   file to read
 * @param doc document receiving the MetaContent entry
 * @return SCAN_ERR_SKIP when the file exceeds JSON_MAX_FILE_SIZE,
 *         SCAN_ERR_READ when the file cannot be read or the buffer cannot be
 *         grown, SCAN_OK otherwise
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath);
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    /* Grow by one byte for the terminator. Keep the old pointer until the
     * realloc is known to have succeeded so it can still be freed. */
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    /* TRUE: require the input to be NUL-terminated with no trailing garbage. */
    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||
|
||||
#define JSON_BUF_SIZE (1024 * 1024 * 5)
|
||||
|
||||
/**
 * Parse a newline-delimited JSON file and store the text of all string
 * values as the document's MetaContent.
 *
 * The file is streamed through a fixed buffer of JSON_BUF_SIZE bytes. Each
 * iteration tops the buffer up, parses one JSON value from its start, then
 * shifts the unconsumed remainder to the front. Parsing stops when:
 *   - the parser makes no progress (end of data or unparseable input),
 *   - a single value does not fit in the buffer, or
 *   - the text buffer (sized by ctx->content_size) is full.
 *
 * @param ctx parser context (content_size, logging callbacks)
 * @param f   file to read
 * @param doc document receiving the MetaContent entry
 * @return SCAN_ERR_READ if the working buffer cannot be allocated,
 *         SCAN_OK otherwise
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    char *buf = malloc(JSON_BUF_SIZE + 1);
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    /* Same content limit as parse_json. */
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t len = 0;     /* number of valid bytes currently held in buf */
    int eof = FALSE;

    while (TRUE) {
        if (!eof && len < JSON_BUF_SIZE) {
            size_t to_read = JSON_BUF_SIZE - len;
            size_t ret = f->read(f, buf + len, to_read);

            /* NOTE(review): the return type of f->read is not visible here.
             * A negative error code converted to size_t shows up as a value
             * larger than the request; treat it as "nothing read". */
            if (ret > to_read) {
                ret = 0;
            }
            if (ret != to_read) {
                eof = TRUE;
            }
            len += ret;
        }

        /* Terminate right after the valid data so the parser never sees
         * uninitialised or stale bytes left behind by a previous shift. */
        buf[len] = '\0';

        const char *parse_end = buf;
        cJSON *json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            /* The parser ran to the end of a completely full buffer. */
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        if (parse_end == buf) {
            /* No progress: nothing left to parse. */
            cJSON_Delete(json);
            break;
        }

        int full = json_extract_text(json, &tex);
        cJSON_Delete(json);

        if (full) {
            break;
        }

        /* Drop the consumed prefix. parse_end never points past the
         * terminator, so consumed <= len. */
        size_t consumed = (size_t) (parse_end - buf);
        len -= consumed;
        memmove(buf, parse_end, len);
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
30
libscan/json/json.h
Normal file
30
libscan/json/json.h
Normal file
@ -0,0 +1,30 @@
|
||||
#ifndef SCAN_JSON_H
|
||||
#define SCAN_JSON_H
|
||||
|
||||
#include "../scan.h"
|
||||
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
store_callback_t store;
|
||||
unsigned int json_mime;
|
||||
unsigned int ndjson_mime;
|
||||
} scan_json_ctx_t;
|
||||
|
||||
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
__always_inline
|
||||
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||
return mime == ctx->json_mime;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||
return mime == ctx->ndjson_mime;
|
||||
}
|
||||
|
||||
#endif
|
@ -22,6 +22,7 @@ typedef void (*log_callback_t)(const char *filepath, int level, char *str);
|
||||
typedef int scan_code_t;
|
||||
#define SCAN_OK (scan_code_t) 0
|
||||
#define SCAN_ERR_READ (scan_code_t) (-1)
|
||||
#define SCAN_ERR_SKIP (scan_code_t) (-2)
|
||||
|
||||
#define LEVEL_DEBUG 0
|
||||
#define LEVEL_INFO 1
|
||||
|
@ -12,6 +12,7 @@ extern "C" {
|
||||
#include "../libscan/raw/raw.h"
|
||||
#include "../libscan/msdoc/msdoc.h"
|
||||
#include "../libscan/wpd/wpd.h"
|
||||
#include "../libscan/json/json.h"
|
||||
#include <libavutil/avutil.h>
|
||||
}
|
||||
|
||||
@ -42,6 +43,8 @@ static scan_msdoc_ctx_t msdoc_text_ctx;
|
||||
|
||||
static scan_wpd_ctx_t wpd_ctx;
|
||||
|
||||
static scan_json_ctx_t json_ctx;
|
||||
|
||||
|
||||
document_t LastSubDoc;
|
||||
|
||||
@ -969,6 +972,26 @@ TEST(Wpd, Wpd51_1) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
// Smoke test: run parse_json() over a sample JSON file.
// No assertions are made on the extracted content.
TEST(Json, Json1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/json/json1.json", &file, &document);

    parse_json(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
|
||||
|
||||
// Smoke test: run parse_ndjson() over a sample newline-delimited JSON file.
// No assertions are made on the extracted content.
TEST(Json, NDJson1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &file, &document);

    parse_ndjson(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
setlocale(LC_ALL, "");
|
||||
|
||||
@ -1053,6 +1076,10 @@ int main(int argc, char **argv) {
|
||||
wpd_ctx.logf = noop_logf;
|
||||
wpd_ctx.content_size = 500;
|
||||
|
||||
json_ctx.log = noop_log;
|
||||
json_ctx.logf = noop_logf;
|
||||
json_ctx.content_size = 5000;
|
||||
|
||||
av_log_set_level(AV_LOG_QUIET);
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
Loading…
x
Reference in New Issue
Block a user