add markup file support

This commit is contained in:
simon987 2020-04-09 20:29:16 -04:00
parent b7a565a1c4
commit 0b0dcf89c0
4 changed files with 82 additions and 14 deletions

View File

@ -16,11 +16,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
text_buffer_append_string(&tex, buf, to_read);
text_buffer_terminate_string(&tex);
meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta->key = MetaContent;
strcpy(meta->str_val, tex.dyn_buffer.buf);
APPEND_META(doc, meta)
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
@ -28,3 +24,28 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
return SCAN_OK;
}
/* Upper bound on the number of bytes read from a markup file.
 * Parenthesized so the macro expands safely inside larger expressions. */
#define MAX_MARKUP_SIZE (1024 * 1024)

/**
 * Extract the text of a markup file into the document's MetaContent entry.
 *
 * At most MAX_MARKUP_SIZE bytes are read from f. The bytes are passed through
 * text_buffer_append_markup() and the resulting string is attached to doc
 * with APPEND_STR_META.
 *
 * @param ctx scan context; ctx->content_size is passed to text_buffer_create()
 * @param f   file to read; f->info.st_size and f->read are used
 * @param doc document receiving the metadata; doc->filepath is used in logs
 * @return SCAN_OK on success, SCAN_ERR_READ when the read fails or the read
 *         buffer cannot be allocated
 */
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

    int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);

    /* text_buffer_append_markup() receives no length, so the buffer must be a
     * NUL-terminated string. Allocating one extra byte and zero-filling the
     * whole buffer guarantees a terminator even when read() returns fewer
     * bytes than requested. */
    char *buf = calloc((size_t) to_read + 1, 1);
    if (buf == NULL) {
        CTX_LOG_ERRORF(doc->filepath, "could not allocate read buffer of [%d] bytes", to_read + 1)
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_markup(&tex, buf);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}

View File

@ -13,4 +13,6 @@ typedef struct {
/* Attaches the text of file f to doc as a MetaContent entry.
 * NOTE(review): only part of the implementation is visible in this change;
 * confirm size limits and error codes against the definition. */
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Reads up to MAX_MARKUP_SIZE bytes of file f, feeds them through
 * text_buffer_append_markup() and attaches the result to doc as a MetaContent
 * entry. Returns SCAN_OK on success or SCAN_ERR_READ when the read fails. */
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -292,6 +292,15 @@ static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
ptr += 1;
}
if (ptr != start) {
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}

View File

@ -35,7 +35,7 @@ TEST(Text, BookCsvContentLen) {
parse_text(&text_500_ctx, &f, &doc);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -99,6 +99,42 @@ TEST(Text, MemWhitespace) {
cleanup(&doc, &f);
}
// Tag-like noise around three words, ending with an empty tag: the extracted
// content is expected to be exactly the three words separated by single spaces.
TEST(TextMarkup, Mem1) {
    const char *markup = "<<a<aa<<<>test<aaaa><>test test <>";
    const size_t markup_len = strlen(markup);

    vfile_t f;
    document_t doc;
    load_doc_mem((void *) markup, markup_len, &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    const char *extracted = get_meta(&doc, MetaContent)->str_val;
    ASSERT_STREQ(extracted, "test test test");

    cleanup(&doc, &f);
}
// Same input as Mem1 but ending with trailing text and a space instead of an
// empty tag: the extracted content is expected to be identical.
TEST(TextMarkup, Mem2) {
    const char *markup = "<<a<aa<<<>test<aaaa><>test test ";
    const size_t markup_len = strlen(markup);

    vfile_t f;
    document_t doc;
    load_doc_mem((void *) markup, markup_len, &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    const char *extracted = get_meta(&doc, MetaContent)->str_val;
    ASSERT_STREQ(extracted, "test test test");

    cleanup(&doc, &f);
}
// Real XML file with UTF-8 content: the extracted text should be about 500
// bytes long (within 4) and still contain the "BMP:𐌈" sequence.
TEST(TextMarkup, Xml1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/text/utf8-example.xml", &f, &doc);

    parse_markup(&text_500_ctx, &f, &doc);

    const char *extracted = get_meta(&doc, MetaContent)->str_val;
    ASSERT_NEAR(strlen(extracted), 500, 4);
    ASSERT_TRUE(strstr(extracted, "BMP:𐌈") != nullptr);

    cleanup(&doc, &f);
}
/* Ebook */
TEST(Ebook, CandlePdf) {
@ -110,7 +146,7 @@ TEST(Ebook, CandlePdf) {
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Microsoft Word - A531 Candlemaking-01.doc");
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Dafydd Prichard");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' ');
cleanup(&doc, &f);
}
@ -134,7 +170,7 @@ TEST(Ebook, Epub1) {
parse_ebook(&ebook_500_ctx, &f, "application/epub+zip", &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Rabies");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -267,7 +303,7 @@ TEST(Ooxml, Pptx1) {
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1");
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -281,7 +317,7 @@ TEST(Ooxml, Docx1) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -295,7 +331,7 @@ TEST(Ooxml, Xlsx1) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -310,7 +346,7 @@ TEST(Mobi, Mobi1) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Gaiman, Neil");
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Norse Mythology");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -324,7 +360,7 @@ TEST(Mobi, Azw) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Nietzsche, Friedrich");
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "On the Genealogy of Morality (Hackett Classics)");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}
@ -338,7 +374,7 @@ TEST(Mobi, Azw3) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "George Orwell; Amélie Audiberti");
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "1984");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f);
}