mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 12:23:00 +00:00
Limited support for UTF16
This commit is contained in:
parent
eaedb31f0b
commit
33f5fb8e2c
@ -4,6 +4,10 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
|
|
||||||
int to_read = MIN(ctx->content_size, f->info.st_size);
|
int to_read = MIN(ctx->content_size, f->info.st_size);
|
||||||
|
|
||||||
|
if (to_read <= 2) {
|
||||||
|
return SCAN_OK;
|
||||||
|
}
|
||||||
|
|
||||||
char *buf = malloc(to_read);
|
char *buf = malloc(to_read);
|
||||||
int ret = f->read(f, buf, to_read);
|
int ret = f->read(f, buf, to_read);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
@ -13,7 +17,14 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
||||||
text_buffer_append_string(&tex, buf, to_read);
|
|
||||||
|
if ((*(int16_t*)buf) == (int16_t)0xFFFE) {
|
||||||
|
text_buffer_append_string16_le(&tex, buf + 2, to_read - 2);
|
||||||
|
} else if((*(int16_t*)buf) == (int16_t)0xFEFF) {
|
||||||
|
text_buffer_append_string16_be(&tex, buf + 2, to_read - 2);
|
||||||
|
} else {
|
||||||
|
text_buffer_append_string(&tex, buf, to_read);
|
||||||
|
}
|
||||||
text_buffer_terminate_string(&tex);
|
text_buffer_terminate_string(&tex);
|
||||||
|
|
||||||
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
|
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
|
||||||
|
@ -213,6 +213,19 @@ static void text_buffer_terminate_string(text_buffer_t *buf) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Naive UTF16 -> ascii conversion
|
||||||
|
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
|
||||||
|
for (int i = 1; i < len; i += 2) {
|
||||||
|
text_buffer_append_char(buf, str[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Naive UTF16 -> ascii conversion, even-offset variant.
 *
 * Appends every second byte of str, starting at offset 0, to buf through
 * text_buffer_append_char(). The bytes at odd offsets are dropped.
 *
 * NOTE(review): parse_text picks this variant by loading the first two bytes
 * of the file as a host-order int16_t and comparing against 0xFEFF, so which
 * on-disk byte order ends up here depends on host endianness. Confirm that
 * pairing before changing the starting offset.
 *
 * @param buf destination text buffer
 * @param str input bytes (parse_text passes the data located after the BOM)
 * @param len number of bytes readable at str
 * @return always 0; the result of text_buffer_append_char() is not inspected
 */
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
    /* size_t index: avoids the signed/unsigned comparison against len. */
    for (size_t i = 0; i < len; i += 2) {
        text_buffer_append_char(buf, str[i]);
    }

    /* The function is declared int, so it must not fall off the end. */
    return 0;
}
|
||||||
|
|
||||||
#define UTF8_END_OF_STRING \
|
#define UTF8_END_OF_STRING \
|
||||||
(ptr - str >= len || *ptr == 0 || \
|
(ptr - str >= len || *ptr == 0 || \
|
||||||
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
|
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
|
||||||
|
@ -62,7 +62,7 @@ TEST(Text, MemUtf8_1) {
|
|||||||
|
|
||||||
parse_text(&text_500_ctx, &f, &doc);
|
parse_text(&text_500_ctx, &f, &doc);
|
||||||
|
|
||||||
ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 1);
|
ASSERT_EQ(get_meta(&doc, MetaContent), nullptr);
|
||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,6 +114,18 @@ TEST(Text, MemWhitespace) {
|
|||||||
cleanup(&doc, &f);
|
cleanup(&doc, &f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Runs parse_text on the pain_is_beauty.log fixture (presumably UTF-16LE with
// a BOM, going by the test name -- confirm against the fixture) and expects
// the extracted MetaContent string to be at least 200 bytes long.
TEST(Text, Utf16LE) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/text/pain_is_beauty.log", &f, &doc);

    parse_text(&text_500_ctx, &f, &doc);

    // parse_text can return without attaching MetaContent (very short input),
    // so fail with a readable assertion instead of dereferencing a null pointer.
    auto *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);
    ASSERT_GE(strlen(content->str_val), 200);

    cleanup(&doc, &f);
}
|
||||||
|
|
||||||
TEST(Text, MemNoise) {
|
TEST(Text, MemNoise) {
|
||||||
char content[600];
|
char content[600];
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user