diff --git a/libscan/text/text.c b/libscan/text/text.c
index 8493192..1d3f1db 100644
--- a/libscan/text/text.c
+++ b/libscan/text/text.c
@@ -16,11 +16,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
     text_buffer_append_string(&tex, buf, to_read);
     text_buffer_terminate_string(&tex);
 
-    meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
-    meta->key = MetaContent;
-    strcpy(meta->str_val, tex.dyn_buffer.buf);
-
-    APPEND_META(doc, meta)
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
 
     free(buf);
     text_buffer_destroy(&tex);
@@ -28,3 +24,39 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
     return SCAN_OK;
 }
 
+#define MAX_MARKUP_SIZE (1024 * 1024)
+
+/**
+ * Reads at most MAX_MARKUP_SIZE bytes from f, feeds them to
+ * text_buffer_append_markup() and stores the resulting text on doc
+ * as MetaContent. The text buffer is created with ctx->content_size.
+ *
+ * Returns SCAN_ERR_READ (after logging) when f->read() reports an error,
+ * SCAN_OK otherwise.
+ */
+scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
+
+    int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
+
+    // text_buffer_append_markup() takes no length argument, so the buffer
+    // must be NUL-terminated: calloc() zero-fills the extra byte as well as
+    // any bytes a short read leaves unwritten.
+    char *buf = calloc(1, (size_t) to_read + 1);
+    int ret = f->read(f, buf, to_read);
+    if (ret < 0) {
+        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
+        free(buf);
+        return SCAN_ERR_READ;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+    text_buffer_append_markup(&tex, buf);
+    text_buffer_terminate_string(&tex);
+
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
+
+    free(buf);
+    text_buffer_destroy(&tex);
+
+    return SCAN_OK;
+}
diff --git a/libscan/text/text.h b/libscan/text/text.h
index 28fb3c6..2477b47 100644
--- a/libscan/text/text.h
+++ b/libscan/text/text.h
@@ -13,4 +13,6 @@ typedef struct {
 
 scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
 
+scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
+
 #endif
diff --git a/libscan/util.h b/libscan/util.h
index 416b58e..fb321c6 100644
--- a/libscan/util.h
+++ b/libscan/util.h
@@ -292,6 +292,15 @@ static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
 
         ptr += 1;
     }
+
+    if (ptr != start) {
+        if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
+            return TEXT_BUF_FULL;
+        }
+        if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
+            return TEXT_BUF_FULL;
+        }
+    }
     return 0;
 }
 
diff --git a/test/main.cpp b/test/main.cpp
index f6cae12..c0f7682 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -35,7 +35,7 @@ TEST(Text, BookCsvContentLen) {
 
     parse_text(&text_500_ctx, &f, &doc);
 
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -99,6 +99,42 @@ TEST(Text, MemWhitespace) {
     cleanup(&doc, &f);
 }
 
+TEST(TextMarkup, Mem1) {
+    const char *content = "<test<>test test <>";
+    vfile_t f;
+    document_t doc;
+    load_doc_mem((void *) content, strlen(content), &f, &doc);
+
+    parse_markup(&text_500_ctx, &f, &doc);
+
+    ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "test test test");
+    cleanup(&doc, &f);
+}
+
+TEST(TextMarkup, Mem2) {
+    const char *content = "<test<>test test ";
+    vfile_t f;
+    document_t doc;
+    load_doc_mem((void *) content, strlen(content), &f, &doc);
+
+    parse_markup(&text_500_ctx, &f, &doc);
+
+    ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "test test test");
+    cleanup(&doc, &f);
+}
+
+TEST(TextMarkup, Xml1) {
+    vfile_t f;
+    document_t doc;
+    load_doc_file("libscan-test-files/test_files/text/utf8-example.xml", &f, &doc);
+
+    parse_markup(&text_500_ctx, &f, &doc);
+
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
+    ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "BMP:𐌈") != nullptr);
+    cleanup(&doc, &f);
+}
+
 /* Ebook */
 
 TEST(Ebook, CandlePdf) {
@@ -110,7 +146,7 @@ TEST(Ebook, CandlePdf) {
 
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Microsoft Word - A531 Candlemaking-01.doc");
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Dafydd Prichard");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' ');
     cleanup(&doc, &f);
 }
@@ -134,7 +170,7 @@ TEST(Ebook, Epub1) {
     parse_ebook(&ebook_500_ctx, &f, "application/epub+zip", &doc);
 
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Rabies");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -267,7 +303,7 @@ TEST(Ooxml, Pptx1) {
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1");
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller");
     ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -281,7 +317,7 @@ TEST(Ooxml, Docx1) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
     ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -295,7 +331,7 @@ TEST(Ooxml, Xlsx1) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis");
     ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -310,7 +346,7 @@ TEST(Mobi, Mobi1) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Gaiman, Neil");
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Norse Mythology");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -324,7 +360,7 @@ TEST(Mobi, Azw) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Nietzsche, Friedrich");
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "On the Genealogy of Morality (Hackett Classics)");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 
@@ -338,7 +374,7 @@ TEST(Mobi, Azw3) {
 
     ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "George Orwell; Amélie Audiberti");
     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "1984");
-    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
+    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
     cleanup(&doc, &f);
 }
 