mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 12:23:00 +00:00
Add media comment, add ooxml author/title, ooxml tests
This commit is contained in:
parent
6504f5ef3a
commit
90c4ca3d6e
@ -12,6 +12,8 @@
|
||||
|
||||
#define MIN_SIZE 32
|
||||
#define AVIO_BUF_SIZE 8192
|
||||
/*
 * Evaluates to non-zero when fmt->iformat->name is set and differs from
 * "image2" (presumably the still-image input format -- confirm).
 * NOTE(review): `fmt` is expanded without parentheses, so pass a plain
 * identifier rather than an expression.
 */
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
|
||||
|
||||
|
||||
__always_inline
|
||||
static AVCodecContext *alloc_jpeg_encoder(scan_media_ctx_t *ctx, int dstW, int dstH, float qscale) {
|
||||
@ -167,6 +169,8 @@ static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
APPEND_TAG_META(doc, tag, MetaAlbumArtist)
|
||||
} else if (strcmp(key, "album") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaAlbum)
|
||||
} else if (strcmp(key, "comment") == 0) {
|
||||
APPEND_TAG_META(doc, tag, MetaContent)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -228,8 +232,6 @@ append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc,
|
||||
}
|
||||
}
|
||||
|
||||
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
|
||||
|
||||
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
|
||||
int video_stream = -1;
|
||||
|
@ -6,6 +6,8 @@
|
||||
#include <libxml/xmlstring.h>
|
||||
#include <libxml/parser.h>
|
||||
|
||||
/*
 * Cast a C string to the `const xmlChar *` type expected by libxml2 string
 * helpers such as xmlStrEqual(). The argument is parenthesized so that the
 * cast applies to the whole expression, not just its first operand.
 * NOTE(review): identifiers starting with '_' + uppercase are reserved in C;
 * the name is kept only because callers already depend on it.
 */
#define _X(str) ((const xmlChar *) (str))
|
||||
|
||||
__always_inline
|
||||
static int should_read_part(const char *part) {
|
||||
|
||||
@ -50,13 +52,19 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
|
||||
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
|
||||
|
||||
if (text) {
|
||||
text_buffer_append_string0(buf, (char *) text);
|
||||
int ret = text_buffer_append_string0(buf, (char *) text);
|
||||
text_buffer_append_char(buf, ' ');
|
||||
xmlFree(text);
|
||||
|
||||
if (ret == TEXT_BUF_FULL) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extract_text(ctx, xml, child->children, buf);
|
||||
if (extract_text(ctx, xml, child->children, buf) == TEXT_BUF_FULL) {
|
||||
return TEXT_BUF_FULL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -71,10 +79,42 @@ int xml_io_close(UNUSED(void *context)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returned by read_part() when the XML part cannot be parsed or has no root element. */
#define READ_PART_ERR (-2)
|
||||
|
||||
__always_inline
|
||||
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
|
||||
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
|
||||
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
|
||||
if (xml == NULL) {
|
||||
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
|
||||
return READ_PART_ERR;
|
||||
}
|
||||
|
||||
xmlNode *root = xmlDocGetRootElement(xml);
|
||||
if (root == NULL) {
|
||||
CTX_LOG_ERROR(doc->filepath, "Empty document")
|
||||
xmlFreeDoc(xml);
|
||||
return READ_PART_ERR;
|
||||
}
|
||||
|
||||
int ret = extract_text(ctx, xml, root, buf);
|
||||
xmlFreeDoc(xml);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Allocate a meta_line_t holding a copy of the C string `value`, tag it with
 * `keyname` and hand it to APPEND_META for `doc`.
 *
 * The expansion is a braced block, so it can be used several times in one
 * scope without redeclaring `meta_str`, and it keeps working at call sites
 * that do not put a semicolon after the macro. `value` is evaluated once.
 * One extra byte is reserved so the terminator fits regardless of how
 * str_val is laid out inside meta_line_t. If the allocation fails the entry
 * is skipped instead of writing through a NULL pointer.
 */
#define APPEND_STR_META(doc, keyname, value) \
    { \
        const char *append_str_meta_val_ = (value); \
        meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(append_str_meta_val_) + 1); \
        if (meta_str != NULL) { \
            meta_str->key = (keyname); \
            strcpy(meta_str->str_val, append_str_meta_val_); \
            APPEND_META(doc, meta_str) \
        } \
    }
|
||||
|
||||
__always_inline
|
||||
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
|
||||
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
|
||||
if (xml == NULL) {
|
||||
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
|
||||
@ -88,7 +128,24 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
|
||||
return -1;
|
||||
}
|
||||
|
||||
extract_text(ctx, xml, root, buf);
|
||||
if (xmlStrEqual(root->name, _X("coreProperties"))) {
|
||||
for (xmlNode *child = root->children; child; child = child->next) {
|
||||
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
|
||||
if (text == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (xmlStrEqual(child->name, _X("title"))) {
|
||||
APPEND_STR_META(doc, MetaTitle, (char *) text)
|
||||
} else if (xmlStrEqual(child->name, _X("creator"))) {
|
||||
APPEND_STR_META(doc, MetaAuthor, (char *) text)
|
||||
} else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) {
|
||||
APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
|
||||
}
|
||||
|
||||
xmlFree(text);
|
||||
}
|
||||
}
|
||||
xmlFreeDoc(xml);
|
||||
|
||||
return 0;
|
||||
@ -97,7 +154,7 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
|
||||
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
size_t buf_len;
|
||||
void * buf = read_all(f, &buf_len);
|
||||
void *buf = read_all(f, &buf_len);
|
||||
|
||||
struct archive *a = archive_read_new();
|
||||
archive_read_support_format_zip(a);
|
||||
@ -113,13 +170,20 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
||||
|
||||
struct archive_entry *entry;
|
||||
int buffer_full = FALSE;
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||
const char *path = archive_entry_pathname(entry);
|
||||
|
||||
if (should_read_part(path)) {
|
||||
if (!buffer_full && should_read_part(path)) {
|
||||
ret = read_part(ctx, a, &tex, doc);
|
||||
if (ret != 0) {
|
||||
if (ret == READ_PART_ERR) {
|
||||
break;
|
||||
} else if (ret == TEXT_BUF_FULL) {
|
||||
buffer_full = TRUE;
|
||||
}
|
||||
} else if (strcmp(path, "docProps/core.xml") == 0) {
|
||||
if (read_doc_props(ctx, a, &tex, doc) != 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -8,7 +8,6 @@ typedef struct {
|
||||
long content_size;
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
store_callback_t store;
|
||||
} scan_ooxml_ctx_t;
|
||||
|
||||
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
@ -79,6 +79,7 @@ enum metakey {
|
||||
MetaExifIsoSpeedRatings = META_STR(22),
|
||||
MetaExifDateTime = META_STR(23),
|
||||
MetaAuthor = META_STR(24),
|
||||
MetaModifiedBy = META_STR(25),
|
||||
};
|
||||
|
||||
typedef struct meta_line {
|
||||
|
@ -6,6 +6,7 @@ extern "C" {
|
||||
#include "../libscan/text/text.h"
|
||||
#include "../libscan/ebook/ebook.h"
|
||||
#include "../libscan/media/media.h"
|
||||
#include "../libscan/ooxml/ooxml.h"
|
||||
#include <libavutil/avutil.h>
|
||||
}
|
||||
|
||||
@ -19,6 +20,8 @@ static scan_ebook_ctx_t ebook_500_ctx;
|
||||
|
||||
static scan_media_ctx_t media_ctx;
|
||||
|
||||
static scan_ooxml_ctx_t ooxml_500_ctx;
|
||||
|
||||
|
||||
|
||||
/* Text */
|
||||
@ -231,6 +234,69 @@ TEST(MediaVideo, Vid3Webm) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
//TODO: test music file with embedded cover art
|
||||
|
||||
/*
 * Parses an MP3 fixture and checks its tag metadata, including the comment
 * tag, which is expected under MetaContent. Each lookup is checked for NULL
 * before it is dereferenced so that a missing entry fails the test instead
 * of crashing the test binary.
 */
TEST(MediaAudio, MusicMp3) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc);

    parse_media(&media_ctx, &f, &doc);

    auto *artist = get_meta(&doc, MetaArtist);
    ASSERT_NE(artist, nullptr);
    ASSERT_STREQ(artist->str_val, "Barry James");

    auto *album = get_meta(&doc, MetaAlbum);
    ASSERT_NE(album, nullptr);
    ASSERT_STREQ(album->str_val, "Strange Slumber, Music for Wonderful Dreams");

    auto *title = get_meta(&doc, MetaTitle);
    ASSERT_NE(title, nullptr);
    ASSERT_STREQ(title->str_val, "The Watchmaker");

    auto *genre = get_meta(&doc, MetaGenre);
    ASSERT_NE(genre, nullptr);
    ASSERT_STREQ(genre->str_val, "New Age");

    auto *comment = get_meta(&doc, MetaContent);
    ASSERT_NE(comment, nullptr);
    ASSERT_STREQ(comment->str_val, "http://magnatune.com/artists/barry_james");

    auto *codec = get_meta(&doc, MetaMediaAudioCodec);
    ASSERT_NE(codec, nullptr);
    ASSERT_STREQ(codec->str_val, "mp3");

    cleanup(&doc, &f);
}
|
||||
|
||||
/* OOXML */
|
||||
|
||||
/*
 * Parses a .pptx fixture with the 500-character OOXML context and checks
 * title, author, last-modified-by and the length of the extracted content.
 * Each lookup is checked for NULL before it is dereferenced so that a
 * missing entry fails the test instead of crashing the test binary.
 */
TEST(Ooxml, Pptx1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/Catalist Presentation.pptx", &f, &doc);

    parse_ooxml(&ooxml_500_ctx, &f, &doc);

    auto *title = get_meta(&doc, MetaTitle);
    ASSERT_NE(title, nullptr);
    ASSERT_STREQ(title->str_val, "Slide 1");

    auto *author = get_meta(&doc, MetaAuthor);
    ASSERT_NE(author, nullptr);
    ASSERT_STREQ(author->str_val, "thofeller");

    auto *modified_by = get_meta(&doc, MetaModifiedBy);
    ASSERT_NE(modified_by, nullptr);
    ASSERT_STREQ(modified_by->str_val, "Hofeller");

    /* Content is expected to fill the 500-character budget, give or take one. */
    auto *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);
    ASSERT_NEAR(strlen(content->str_val), 500, 1);

    cleanup(&doc, &f);
}
|
||||
|
||||
/*
 * Parses a .docx fixture with the 500-character OOXML context and checks
 * author, last-modified-by and the length of the extracted content.
 * Each lookup is checked for NULL before it is dereferenced so that a
 * missing entry fails the test instead of crashing the test binary.
 */
TEST(Ooxml, Docx1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/How To Play A DVD On Windows 8.docx", &f, &doc);

    parse_ooxml(&ooxml_500_ctx, &f, &doc);

    auto *author = get_meta(&doc, MetaAuthor);
    ASSERT_NE(author, nullptr);
    ASSERT_STREQ(author->str_val, "Thomas");

    auto *modified_by = get_meta(&doc, MetaModifiedBy);
    ASSERT_NE(modified_by, nullptr);
    ASSERT_STREQ(modified_by->str_val, "Thomas");

    /* Content is expected to fill the 500-character budget, give or take one. */
    auto *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);
    ASSERT_NEAR(strlen(content->str_val), 500, 1);

    cleanup(&doc, &f);
}
|
||||
|
||||
/*
 * Parses a .xlsx fixture with the 500-character OOXML context and checks
 * author, last-modified-by and the length of the extracted content.
 * Each lookup is checked for NULL before it is dereferenced so that a
 * missing entry fails the test instead of crashing the test binary.
 */
TEST(Ooxml, Xlsx1) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/xlsx1.xlsx", &f, &doc);

    parse_ooxml(&ooxml_500_ctx, &f, &doc);

    auto *author = get_meta(&doc, MetaAuthor);
    ASSERT_NE(author, nullptr);
    ASSERT_STREQ(author->str_val, "Bureau of Economic Analysis");

    auto *modified_by = get_meta(&doc, MetaModifiedBy);
    ASSERT_NE(modified_by, nullptr);
    ASSERT_STREQ(modified_by->str_val, "lz");

    /* Content is expected to fill the 500-character budget, give or take one. */
    auto *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);
    ASSERT_NEAR(strlen(content->str_val), 500, 1);

    cleanup(&doc, &f);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
arc_recurse_ctx.log = noop_log;
|
||||
@ -265,6 +331,10 @@ int main(int argc, char **argv) {
|
||||
media_ctx.tn_size = 500;
|
||||
media_ctx.tn_qscale = 1.0;
|
||||
|
||||
ooxml_500_ctx.content_size = 500;
|
||||
ooxml_500_ctx.log = noop_log;
|
||||
ooxml_500_ctx.logf = noop_logf;
|
||||
|
||||
av_log_set_level(AV_LOG_QUIET);
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
Loading…
x
Reference in New Issue
Block a user