Add media comment, add ooxml author/title, ooxml tests

This commit is contained in:
simon987 2020-04-09 11:00:20 -04:00
parent 6504f5ef3a
commit 90c4ca3d6e
5 changed files with 146 additions and 10 deletions

View File

@ -12,6 +12,8 @@
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
/*
 * Evaluates to non-zero when the input format attached to `fmt` has a name
 * and that name is not "image2"; a NULL format name yields zero.
 * NOTE(review): `fmt` is expanded without parentheses, so pass a plain
 * pointer expression (an identifier), not e.g. `p + 1`.
 */
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
__always_inline
static AVCodecContext *alloc_jpeg_encoder(scan_media_ctx_t *ctx, int dstW, int dstH, float qscale) {
@ -167,6 +169,8 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
APPEND_TAG_META(doc, tag, MetaAlbumArtist)
} else if (strcmp(key, "album") == 0) {
APPEND_TAG_META(doc, tag, MetaAlbum)
} else if (strcmp(key, "comment") == 0) {
APPEND_TAG_META(doc, tag, MetaContent)
}
}
}
@ -228,8 +232,6 @@ append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc,
}
}
/*
 * Evaluates to non-zero when the input format attached to `fmt` has a name
 * and that name is not "image2"; a NULL format name yields zero.
 * NOTE(review): a token-identical definition also appears next to
 * AVIO_BUF_SIZE near the top of this listing; presumably only one of the two
 * is meant to remain - confirm.
 */
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;

View File

@ -6,6 +6,8 @@
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
/*
 * Cast a C string expression to the `const xmlChar *` type expected by the
 * libxml2 string helpers. The argument is parenthesised so the cast applies to
 * the whole expression, whatever operators it contains.
 */
#define _X(str) ((const xmlChar *) (str))
__always_inline
static int should_read_part(const char *part) {
@ -50,13 +52,19 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text) {
text_buffer_append_string0(buf, (char *) text);
int ret = text_buffer_append_string0(buf, (char *) text);
text_buffer_append_char(buf, ' ');
xmlFree(text);
if (ret == TEXT_BUF_FULL) {
return ret;
}
}
}
extract_text(ctx, xml, child->children, buf);
if (extract_text(ctx, xml, child->children, buf) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}
@ -71,10 +79,42 @@ int xml_io_close(UNUSED(void *context)) {
return 0;
}
/*
 * Returned by read_part() when the current archive entry cannot be turned into
 * an XML tree. Parenthesised so the sign stays attached in any expression.
 */
#define READ_PART_ERR (-2)

/**
 * Parse the archive entry that `a` is currently positioned on as XML and
 * append its text to `buf`.
 *
 * The entry is parsed exactly once; the resulting tree is released on every
 * path that obtained one.
 *
 * @param ctx scan context, forwarded to extract_text()
 * @param a   archive handle used as the xmlReadIO I/O context
 * @param buf text buffer receiving the extracted text
 * @param doc document being scanned; its filepath is used in error logs
 * @return READ_PART_ERR when the XML cannot be parsed or has no root element,
 *         otherwise whatever extract_text() returns for the root node
 */
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
                            XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);

    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return READ_PART_ERR;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return READ_PART_ERR;
    }

    /* Propagate the extraction status so the caller can react to a full buffer. */
    int ret = extract_text(ctx, xml, root, buf);

    xmlFreeDoc(xml);

    return ret;
}
/*
 * Append a string-valued metadata entry to `doc`.
 *
 * Allocates a meta_line_t with room for a copy of `value` (terminator
 * included), sets its key to `keyname`, copies the string into str_val and
 * hands the entry to APPEND_META.
 *
 * The expansion declares a local named `meta_str`, so use it at most once per
 * scope. `value` is evaluated twice; pass an expression without side effects.
 */
#define APPEND_STR_META(doc, keyname, value) \
    meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value) + 1); \
    meta_str->key = keyname; \
    strcpy(meta_str->str_val, value); \
    APPEND_META(doc, meta_str)
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
@ -88,7 +128,24 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
return -1;
}
extract_text(ctx, xml, root, buf);
/* Only a document whose root element is named "coreProperties" carries the metadata read below. */
if (xmlStrEqual(root->name, _X("coreProperties"))) {
/* Walk the direct children of the root; each child of interest holds one property as text. */
for (xmlNode *child = root->children; child; child = child->next) {
/* The returned string is released with xmlFree() at the end of the iteration. */
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
/* Children for which no text could be obtained are skipped. */
if (text == NULL) {
continue;
}
/* Map the element name to the matching metadata key; other elements are ignored. */
if (xmlStrEqual(child->name, _X("title"))) {
APPEND_STR_META(doc, MetaTitle, (char *) text)
} else if (xmlStrEqual(child->name, _X("creator"))) {
APPEND_STR_META(doc, MetaAuthor, (char *) text)
} else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) {
APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
}
/* APPEND_STR_META copies the string with strcpy, so the libxml2 buffer can be freed here. */
xmlFree(text);
}
}
xmlFreeDoc(xml);
return 0;
@ -97,7 +154,7 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
void * buf = read_all(f, &buf_len);
void *buf = read_all(f, &buf_len);
struct archive *a = archive_read_new();
archive_read_support_format_zip(a);
@ -113,13 +170,20 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
text_buffer_t tex = text_buffer_create(ctx->content_size);
struct archive_entry *entry;
int buffer_full = FALSE;
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char *path = archive_entry_pathname(entry);
if (should_read_part(path)) {
if (!buffer_full && should_read_part(path)) {
ret = read_part(ctx, a, &tex, doc);
if (ret != 0) {
if (ret == READ_PART_ERR) {
break;
} else if (ret == TEXT_BUF_FULL) {
buffer_full = TRUE;
}
} else if (strcmp(path, "docProps/core.xml") == 0) {
if (read_doc_props(ctx, a, &tex, doc) != 0) {
break;
}
}

View File

@ -8,7 +8,6 @@ typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);

View File

@ -79,6 +79,7 @@ enum metakey {
MetaExifIsoSpeedRatings = META_STR(22),
MetaExifDateTime = META_STR(23),
MetaAuthor = META_STR(24),
MetaModifiedBy = META_STR(25),
};
typedef struct meta_line {

View File

@ -6,6 +6,7 @@ extern "C" {
#include "../libscan/text/text.h"
#include "../libscan/ebook/ebook.h"
#include "../libscan/media/media.h"
#include "../libscan/ooxml/ooxml.h"
#include <libavutil/avutil.h>
}
@ -19,6 +20,8 @@ static scan_ebook_ctx_t ebook_500_ctx;
static scan_media_ctx_t media_ctx;
static scan_ooxml_ctx_t ooxml_500_ctx;
/* Text */
@ -231,6 +234,69 @@ TEST(MediaVideo, Vid3Webm) {
cleanup(&doc, &f);
}
//TODO: test music file with embedded cover art
// Parses a tagged MP3 and checks every tag-derived metadata field, including
// the "comment" tag which is exposed as MetaContent, plus the audio codec.
TEST(MediaAudio, MusicMp3) {
    vfile_t file;
    document_t doc;

    load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &file, &doc);
    parse_media(&media_ctx, &file, &doc);

    // Tag metadata
    ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James");
    ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams");
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "The Watchmaker");
    ASSERT_STREQ(get_meta(&doc, MetaGenre)->str_val, "New Age");
    ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "http://magnatune.com/artists/barry_james");

    // Stream metadata
    ASSERT_STREQ(get_meta(&doc, MetaMediaAudioCodec)->str_val, "mp3");

    cleanup(&doc, &file);
}
/* OOXML */
// Parses a .pptx with the 500-character context and checks the core
// properties (title, author, last modifier) and the length of the content.
TEST(Ooxml, Pptx1) {
    vfile_t file;
    document_t doc;

    load_doc_file("libscan-test-files/test_files/ooxml/Catalist Presentation.pptx", &file, &doc);
    parse_ooxml(&ooxml_500_ctx, &file, &doc);

    // Core properties
    ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1");
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller");
    ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller");

    // Extracted text length must be within 1 of the configured content size
    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);

    cleanup(&doc, &file);
}
// Parses a .docx with the 500-character context and checks the author,
// last modifier and the length of the extracted content.
TEST(Ooxml, Docx1) {
    vfile_t file;
    document_t doc;

    load_doc_file("libscan-test-files/test_files/ooxml/How To Play A DVD On Windows 8.docx", &file, &doc);
    parse_ooxml(&ooxml_500_ctx, &file, &doc);

    // Core properties
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
    ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");

    // Extracted text length must be within 1 of the configured content size
    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);

    cleanup(&doc, &file);
}
// Parses a .xlsx with the 500-character context and checks the author,
// last modifier and the length of the extracted content.
TEST(Ooxml, Xlsx1) {
    vfile_t file;
    document_t doc;

    load_doc_file("libscan-test-files/test_files/ooxml/xlsx1.xlsx", &file, &doc);
    parse_ooxml(&ooxml_500_ctx, &file, &doc);

    // Core properties
    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis");
    ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz");

    // Extracted text length must be within 1 of the configured content size
    ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);

    cleanup(&doc, &file);
}
int main(int argc, char **argv) {
arc_recurse_ctx.log = noop_log;
@ -265,6 +331,10 @@ int main(int argc, char **argv) {
media_ctx.tn_size = 500;
media_ctx.tn_qscale = 1.0;
ooxml_500_ctx.content_size = 500;
ooxml_500_ctx.log = noop_log;
ooxml_500_ctx.logf = noop_logf;
av_log_set_level(AV_LOG_QUIET);
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();