diff --git a/libscan/text/text.c b/libscan/text/text.c
index 62ae442..e674c0c 100644
--- a/libscan/text/text.c
+++ b/libscan/text/text.c
@@ -4,6 +4,11 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
 
     int to_read = MIN(ctx->content_size, f->info.st_size);
 
+    // Skip inputs of 2 bytes or fewer; this also keeps the two-byte BOM check below inside the buffer
+    if (to_read <= 2) {
+        return SCAN_OK;
+    }
+
     char *buf = malloc(to_read);
     int ret = f->read(f, buf, to_read);
     if (ret < 0) {
@@ -13,7 +18,16 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
     }
 
     text_buffer_t tex = text_buffer_create(ctx->content_size);
-    text_buffer_append_string(&tex, buf, to_read);
+
+    // Compare the BOM byte by byte so detection does not depend on host endianness
+    const unsigned char *bom = (const unsigned char *) buf;
+    if (bom[0] == 0xFF && bom[1] == 0xFE) {
+        text_buffer_append_string16_le(&tex, buf + 2, to_read - 2);
+    } else if (bom[0] == 0xFE && bom[1] == 0xFF) {
+        text_buffer_append_string16_be(&tex, buf + 2, to_read - 2);
+    } else {
+        text_buffer_append_string(&tex, buf, to_read);
+    }
     text_buffer_terminate_string(&tex);
 
     APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
diff --git a/libscan/util.h b/libscan/util.h
index 87e46c1..4e3fe28 100644
--- a/libscan/util.h
+++ b/libscan/util.h
@@ -213,6 +213,25 @@ static void text_buffer_terminate_string(text_buffer_t *buf) {
     }
 }
 
+// Naive UTF-16 -> ASCII conversion: only the low byte of each 16-bit code unit
+// is appended, so non-ASCII characters are not preserved faithfully.
+// str points just past the BOM; len is a byte count. Always returns 0.
+static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
+    // Little-endian: the low byte is the first byte of each pair
+    for (size_t i = 0; i + 1 < len; i += 2) {
+        text_buffer_append_char(buf, str[i]);
+    }
+    return 0;
+}
+
+static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
+    // Big-endian: the low byte is the second byte of each pair
+    for (size_t i = 1; i < len; i += 2) {
+        text_buffer_append_char(buf, str[i]);
+    }
+    return 0;
+}
+
 #define UTF8_END_OF_STRING \
     (ptr - str >= len || *ptr == 0 || \
     (0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
diff --git a/test/main.cpp b/test/main.cpp
index 8eac0ed..f0182c8 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -62,7 +62,7 @@ TEST(Text, MemUtf8_1) {
 
     parse_text(&text_500_ctx, &f, &doc);
 
-    ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 1);
+    ASSERT_EQ(get_meta(&doc, MetaContent), nullptr);
 
     cleanup(&doc, &f);
 }
@@ -114,6 +114,18 @@ TEST(Text, MemWhitespace) {
     cleanup(&doc, &f);
 }
 
+TEST(Text, Utf16LE) {
+
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/text/pain_is_beauty.log", &f, &doc);
+    parse_text(&text_500_ctx, &f, &doc);
+
+    ASSERT_GE(strlen(get_meta(&doc, MetaContent)->str_val), 200);
+
+    cleanup(&doc, &f);
+}
+
 TEST(Text, MemNoise) {
     char content[600];
 