mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 12:23:00 +00:00
add markup file support
This commit is contained in:
parent
b7a565a1c4
commit
0b0dcf89c0
@ -16,11 +16,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
text_buffer_append_string(&tex, buf, to_read);
|
||||
text_buffer_terminate_string(&tex);
|
||||
|
||||
meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
|
||||
meta->key = MetaContent;
|
||||
strcpy(meta->str_val, tex.dyn_buffer.buf);
|
||||
|
||||
APPEND_META(doc, meta)
|
||||
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
|
||||
|
||||
free(buf);
|
||||
text_buffer_destroy(&tex);
|
||||
@ -28,3 +24,28 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
return SCAN_OK;
|
||||
}
|
||||
|
||||
#define MAX_MARKUP_SIZE 1024 * 1024
|
||||
|
||||
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
|
||||
|
||||
char *buf = malloc(to_read);
|
||||
int ret = f->read(f, buf, to_read);
|
||||
if (ret < 0) {
|
||||
CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
|
||||
free(buf);
|
||||
return SCAN_ERR_READ;
|
||||
}
|
||||
|
||||
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
||||
text_buffer_append_markup(&tex, buf);
|
||||
text_buffer_terminate_string(&tex);
|
||||
|
||||
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
|
||||
|
||||
free(buf);
|
||||
text_buffer_destroy(&tex);
|
||||
|
||||
return SCAN_OK;
|
||||
}
|
||||
|
@ -13,4 +13,6 @@ typedef struct {
|
||||
|
||||
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
#endif
|
||||
|
@ -292,6 +292,15 @@ static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
|
||||
|
||||
ptr += 1;
|
||||
}
|
||||
|
||||
if (ptr != start) {
|
||||
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
|
||||
return TEXT_BUF_FULL;
|
||||
}
|
||||
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
|
||||
return TEXT_BUF_FULL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -35,7 +35,7 @@ TEST(Text, BookCsvContentLen) {
|
||||
|
||||
parse_text(&text_500_ctx, &f, &doc);
|
||||
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
@ -99,6 +99,42 @@ TEST(Text, MemWhitespace) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
// In-memory markup with nested/unbalanced angle brackets and a trailing
// empty tag: only the three "test" words are expected in the content.
TEST(TextMarkup, Mem1) {
    const char *content = "<<a<aa<<<>test<aaaa><>test test <>";

    document_t doc;
    vfile_t f;
    load_doc_mem((void *) content, strlen(content), &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    const char *extracted = get_meta(&doc, MetaContent)->str_val;
    ASSERT_STREQ(extracted, "test test test");

    cleanup(&doc, &f);
}
|
||||
|
||||
// Same markup as Mem1 but the input ends in plain text (with a trailing
// space) instead of a closing tag; the extracted content must be identical.
TEST(TextMarkup, Mem2) {
    const char *content = "<<a<aa<<<>test<aaaa><>test test ";

    document_t doc;
    vfile_t f;
    load_doc_mem((void *) content, strlen(content), &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    const char *extracted = get_meta(&doc, MetaContent)->str_val;
    ASSERT_STREQ(extracted, "test test test");

    cleanup(&doc, &f);
}
|
||||
|
||||
// Parses a UTF-8 XML sample from disk with the 500-character context:
// the content length must land within 4 of 500 and must still contain
// the "BMP:" marker followed by its multi-byte character.
TEST(TextMarkup, Xml1) {
    document_t doc;
    vfile_t f;
    load_doc_file("libscan-test-files/test_files/text/utf8-example.xml", &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);

    const char *marker = strstr(get_meta(&doc, MetaContent)->str_val, "BMP:𐌈");
    ASSERT_TRUE(marker != nullptr);

    cleanup(&doc, &f);
}
|
||||
|
||||
/* Ebook */
|
||||
|
||||
TEST(Ebook, CandlePdf) {
|
||||
@ -110,7 +146,7 @@ TEST(Ebook, CandlePdf) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Microsoft Word - A531 Candlemaking-01.doc");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Dafydd Prichard");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' ');
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -134,7 +170,7 @@ TEST(Ebook, Epub1) {
|
||||
parse_ebook(&ebook_500_ctx, &f, "application/epub+zip", &doc);
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Rabies");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
@ -267,7 +303,7 @@ TEST(Ooxml, Pptx1) {
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -281,7 +317,7 @@ TEST(Ooxml, Docx1) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -295,7 +331,7 @@ TEST(Ooxml, Xlsx1) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -310,7 +346,7 @@ TEST(Mobi, Mobi1) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Gaiman, Neil");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Norse Mythology");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -324,7 +360,7 @@ TEST(Mobi, Azw) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Nietzsche, Friedrich");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "On the Genealogy of Morality (Hackett Classics)");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -338,7 +374,7 @@ TEST(Mobi, Azw3) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "George Orwell; Amélie Audiberti");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "1984");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user