Limited support for UTF16

This commit is contained in:
simon987 2020-08-25 13:15:10 -04:00
parent eaedb31f0b
commit 33f5fb8e2c
3 changed files with 38 additions and 2 deletions

View File

@ -4,6 +4,10 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(ctx->content_size, f->info.st_size); int to_read = MIN(ctx->content_size, f->info.st_size);
if (to_read <= 2) {
return SCAN_OK;
}
char *buf = malloc(to_read); char *buf = malloc(to_read);
int ret = f->read(f, buf, to_read); int ret = f->read(f, buf, to_read);
if (ret < 0) { if (ret < 0) {
@ -13,7 +17,14 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
} }
text_buffer_t tex = text_buffer_create(ctx->content_size); text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_string(&tex, buf, to_read);
if ((*(int16_t*)buf) == (int16_t)0xFFFE) {
text_buffer_append_string16_le(&tex, buf + 2, to_read - 2);
} else if((*(int16_t*)buf) == (int16_t)0xFEFF) {
text_buffer_append_string16_be(&tex, buf + 2, to_read - 2);
} else {
text_buffer_append_string(&tex, buf, to_read);
}
text_buffer_terminate_string(&tex); text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf); APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

View File

@ -213,6 +213,19 @@ static void text_buffer_terminate_string(text_buffer_t *buf) {
} }
} }
// Naive UTF16 -> ascii conversion
/**
 * Naive UTF-16 -> single-byte conversion: appends every byte found at an odd
 * offset (1, 3, 5, ...) of `str` to `buf`, discarding the bytes at even
 * offsets. No surrogate or non-ASCII handling is attempted.
 *
 * NOTE(review): this function reads the SECOND byte of each 2-byte unit.
 * In UTF-16LE the low (ASCII) byte is normally the first one; the visible
 * caller pairs this function with a BOM that reads as 0xFFFE through a native
 * int16_t load, so the naming/byte-order pairing should be confirmed before
 * reusing this helper elsewhere.
 *
 * @param buf destination text buffer
 * @param str raw bytes, positioned after the byte-order mark
 * @param len number of bytes available in `str`
 * @return always 0; results of text_buffer_append_char are not inspected
 */
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
    /* size_t index: avoids the signed/unsigned comparison against len. */
    for (size_t i = 1; i < len; i += 2) {
        text_buffer_append_char(buf, str[i]);
    }
    /* The function is declared int; falling off the end without a value is
     * undefined behavior as soon as a caller uses the result. */
    return 0;
}
/**
 * Naive UTF-16 -> single-byte conversion: appends every byte found at an even
 * offset (0, 2, 4, ...) of `str` to `buf`, discarding the bytes at odd
 * offsets. No surrogate or non-ASCII handling is attempted.
 *
 * NOTE(review): this function reads the FIRST byte of each 2-byte unit.
 * In UTF-16BE the low (ASCII) byte is normally the second one; the visible
 * caller pairs this function with a BOM that reads as 0xFEFF through a native
 * int16_t load, so the naming/byte-order pairing should be confirmed before
 * reusing this helper elsewhere.
 *
 * @param buf destination text buffer
 * @param str raw bytes, positioned after the byte-order mark
 * @param len number of bytes available in `str`
 * @return always 0; results of text_buffer_append_char are not inspected
 */
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
    /* size_t index: avoids the signed/unsigned comparison against len. */
    for (size_t i = 0; i < len; i += 2) {
        text_buffer_append_char(buf, str[i]);
    }
    /* The function is declared int; falling off the end without a value is
     * undefined behavior as soon as a caller uses the result. */
    return 0;
}
#define UTF8_END_OF_STRING \ #define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \ (ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \ (0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \

View File

@ -62,7 +62,7 @@ TEST(Text, MemUtf8_1) {
parse_text(&text_500_ctx, &f, &doc); parse_text(&text_500_ctx, &f, &doc);
ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 1); ASSERT_EQ(get_meta(&doc, MetaContent), nullptr);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
@ -114,6 +114,18 @@ TEST(Text, MemWhitespace) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Parses a UTF-16 encoded log file and checks that a non-trivial amount of
// text (at least 200 bytes) ends up in the document's content metadata.
TEST(Text, Utf16LE) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/text/pain_is_beauty.log", &f, &doc);

    parse_text(&text_500_ctx, &f, &doc);

    // get_meta() can return nullptr when no content was produced (another
    // test in this file asserts exactly that), so fail cleanly here instead
    // of dereferencing a null pointer.
    auto *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);
    ASSERT_GE(strlen(content->str_val), 200);

    cleanup(&doc, &f);
}
TEST(Text, MemNoise) { TEST(Text, MemNoise) {
char content[600]; char content[600];