diff --git a/libscan/media/media.c b/libscan/media/media.c index 03924ee..b96f79e 100644 --- a/libscan/media/media.c +++ b/libscan/media/media.c @@ -12,6 +12,8 @@ #define MIN_SIZE 32 #define AVIO_BUF_SIZE 8192 +#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0) + __always_inline static AVCodecContext *alloc_jpeg_encoder(scan_media_ctx_t *ctx, int dstW, int dstH, float qscale) { @@ -167,6 +169,8 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) { APPEND_TAG_META(doc, tag, MetaAlbumArtist) } else if (strcmp(key, "album") == 0) { APPEND_TAG_META(doc, tag, MetaAlbum) + } else if (strcmp(key, "comment") == 0) { + APPEND_TAG_META(doc, tag, MetaContent) } } } @@ -228,8 +232,6 @@ append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, } } -#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0) - void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) { int video_stream = -1; diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c index f7e6122..5e91b1a 100644 --- a/libscan/ooxml/ooxml.c +++ b/libscan/ooxml/ooxml.c @@ -6,6 +6,8 @@ #include #include +#define _X(str) ((const xmlChar*)str) + __always_inline static int should_read_part(const char *part) { @@ -50,13 +52,19 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_ xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1); if (text) { - text_buffer_append_string0(buf, (char *) text); + int ret = text_buffer_append_string0(buf, (char *) text); text_buffer_append_char(buf, ' '); xmlFree(text); + + if (ret == TEXT_BUF_FULL) { + return ret; + } } } - extract_text(ctx, xml, child->children, buf); + if (extract_text(ctx, xml, child->children, buf) == TEXT_BUF_FULL) { + return TEXT_BUF_FULL; + } } return 0; } @@ -71,10 +79,42 @@ int xml_io_close(UNUSED(void *context)) { return 0; } +#define READ_PART_ERR -2 + __always_inline static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) { - xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET); + xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, + XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET); + + if (xml == NULL) { + CTX_LOG_ERROR(doc->filepath, "Could not parse XML") + return READ_PART_ERR; + } + + xmlNode *root = xmlDocGetRootElement(xml); + if (root == NULL) { + CTX_LOG_ERROR(doc->filepath, "Empty document") + xmlFreeDoc(xml); + return READ_PART_ERR; + } + + int ret = extract_text(ctx, xml, root, buf); + xmlFreeDoc(xml); + + return ret; +} + +#define APPEND_STR_META(doc, keyname, value) \ + meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ + meta_str->key = keyname; \ + strcpy(meta_str->str_val, value); \ + APPEND_META(doc, meta_str) + +__always_inline +static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) { + xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, + XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET); if (xml == NULL) { CTX_LOG_ERROR(doc->filepath, "Could not parse XML") @@ -88,7 +128,24 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu return -1; } - extract_text(ctx, xml, root, buf); + if (xmlStrEqual(root->name, _X("coreProperties"))) { + for (xmlNode *child = root->children; child; child = child->next) { + xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1); + if (text == NULL) { + continue; + } + + if (xmlStrEqual(child->name, _X("title"))) { + APPEND_STR_META(doc, MetaTitle, (char *) text) + } else if (xmlStrEqual(child->name, _X("creator"))) { + APPEND_STR_META(doc, MetaAuthor, (char *) text) + } else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) { + APPEND_STR_META(doc, MetaModifiedBy, (char *) text) + } + + xmlFree(text); + } + } xmlFreeDoc(xml); return 0; @@ -97,7 +154,7 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) { size_t buf_len; - void * buf = read_all(f, &buf_len); + void *buf = read_all(f, &buf_len); struct archive *a = archive_read_new(); archive_read_support_format_zip(a); @@ -113,13 +170,20 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) { text_buffer_t tex = text_buffer_create(ctx->content_size); struct archive_entry *entry; + int buffer_full = FALSE; while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { if (S_ISREG(archive_entry_stat(entry)->st_mode)) { const char *path = archive_entry_pathname(entry); - if (should_read_part(path)) { + if (!buffer_full && should_read_part(path)) { ret = read_part(ctx, a, &tex, doc); - if (ret != 0) { + if (ret == READ_PART_ERR) { + break; + } else if (ret == TEXT_BUF_FULL) { + buffer_full = TRUE; + } + } else if (strcmp(path, "docProps/core.xml") == 0) { + if (read_doc_props(ctx, a, &tex, doc) != 0) { break; } } diff --git a/libscan/ooxml/ooxml.h b/libscan/ooxml/ooxml.h index 5b2513c..19c31ae 100644 --- a/libscan/ooxml/ooxml.h +++ b/libscan/ooxml/ooxml.h @@ -8,7 +8,6 @@ typedef struct { long content_size; log_callback_t log; logf_callback_t logf; - store_callback_t store; } scan_ooxml_ctx_t; void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc); diff --git a/libscan/scan.h b/libscan/scan.h index c84a143..8f67a6b 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -79,6 +79,7 @@ enum metakey { MetaExifIsoSpeedRatings = META_STR(22), MetaExifDateTime = META_STR(23), MetaAuthor = META_STR(24), + MetaModifiedBy = META_STR(25), }; typedef struct meta_line { diff --git a/test/main.cpp b/test/main.cpp index 6a10775..8dff3a4 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -6,6 +6,7 @@ extern "C" { #include "../libscan/text/text.h" #include "../libscan/ebook/ebook.h" #include "../libscan/media/media.h" +#include "../libscan/ooxml/ooxml.h" #include } @@ -19,6 +20,8 @@ static scan_ebook_ctx_t ebook_500_ctx; static scan_media_ctx_t media_ctx; +static scan_ooxml_ctx_t ooxml_500_ctx; + /* Text */ @@ -231,6 +234,69 @@ TEST(MediaVideo, Vid3Webm) { cleanup(&doc, &f); } +//TODO: test music file with embedded cover art + +TEST(MediaAudio, MusicMp3) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc); + + parse_media(&media_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James"); + ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams"); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "The Watchmaker"); + ASSERT_STREQ(get_meta(&doc, MetaGenre)->str_val, "New Age"); + ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "http://magnatune.com/artists/barry_james"); + ASSERT_STREQ(get_meta(&doc, MetaMediaAudioCodec)->str_val, "mp3"); + + cleanup(&doc, &f); +} + +/* OOXML */ + +TEST(Ooxml, Pptx1) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ooxml/Catalist Presentation.pptx", &f, &doc); + + parse_ooxml(&ooxml_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1"); + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller"); + ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} + +TEST(Ooxml, Docx1) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ooxml/How To Play A DVD On Windows 8.docx", &f, &doc); + + parse_ooxml(&ooxml_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas"); + ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} + +TEST(Ooxml, Xlsx1) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ooxml/xlsx1.xlsx", &f, &doc); + + parse_ooxml(&ooxml_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis"); + ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} int main(int argc, char **argv) { arc_recurse_ctx.log = noop_log; @@ -265,6 +331,10 @@ int main(int argc, char **argv) { media_ctx.tn_size = 500; media_ctx.tn_qscale = 1.0; + ooxml_500_ctx.content_size = 500; + ooxml_500_ctx.log = noop_log; + ooxml_500_ctx.logf = noop_logf; + av_log_set_level(AV_LOG_QUIET); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS();