libscan/test/main.cpp

341 lines
10 KiB
C++

#include <gtest/gtest.h>
#include "test_util.h"
extern "C" {
#include "../libscan/arc/arc.h"
#include "../libscan/text/text.h"
#include "../libscan/ebook/ebook.h"
#include "../libscan/media/media.h"
#include "../libscan/ooxml/ooxml.h"
#include <libavutil/avutil.h>
}
static scan_arc_ctx_t arc_recurse_ctx;
static scan_arc_ctx_t arc_list_ctx;
static scan_text_ctx_t text_500_ctx;
static scan_ebook_ctx_t ebook_ctx;
static scan_ebook_ctx_t ebook_500_ctx;
static scan_media_ctx_t media_ctx;
static scan_ooxml_ctx_t ooxml_500_ctx;
/* Text */
TEST(Text, BookCsvContentLen) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/text/books.csv", &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
cleanup(&doc, &f);
}
TEST(Text, MemUtf8_1) {
const char *content = "a";
vfile_t f;
document_t doc;
load_doc_mem((void *) content, strlen(content), &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 1);
cleanup(&doc, &f);
}
TEST(Text, MemUtf8_Invalid1) {
const char *content = "12345\xE0";
vfile_t f;
document_t doc;
load_doc_mem((void *) content, strlen(content), &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "12345");
cleanup(&doc, &f);
}
TEST(Text, MemUtf8_2) {
const char *content = "最後測試";
vfile_t f;
document_t doc;
load_doc_mem((void *) content, strlen(content), &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "最後測試");
cleanup(&doc, &f);
}
TEST(Text, MemUtf8_Invalid2) {
const char *content = "最後測\xe8\xa9";
vfile_t f;
document_t doc;
load_doc_mem((void *) content, strlen(content), &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "最後測");
cleanup(&doc, &f);
}
TEST(Text, MemWhitespace) {
const char *content = "\n \ttest\t\ntest test ";
vfile_t f;
document_t doc;
load_doc_mem((void *) content, strlen(content), &f, &doc);
parse_text(&text_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "test test test");
cleanup(&doc, &f);
}
/* Ebook */
TEST(Ebook, CandlePdf) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/General_-_Candle_Making.pdf", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Microsoft Word - A531 Candlemaking-01.doc");
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Dafydd Prichard");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' ');
cleanup(&doc, &f);
}
TEST(Ebook, Utf8Pdf) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/utf8.pdf", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/pdf", &doc);
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "最後測試 "));
cleanup(&doc, &f);
}
TEST(Ebook, Epub1) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/epub1.epub", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/epub+zip", &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Rabies");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
cleanup(&doc, &f);
}
TEST(Ebook, ComicCbz) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/lost_treasure.cbz", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/vnd.comicbook+zip", &doc);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
TEST(Ebook, ComicCbr) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/laugh.cbr", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/vnd.comicbook-rar", &doc);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
/* Media (image) */
TEST(MediaImage, Exif1) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc);
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end "
"hotels or what, but I've seen it in a few places in Thailand: "
"There's a tradition of flower folding, doing a sort of light "
"origami with the petals of lotus and other flowers, to make "
"cute little ornaments.");
ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "NIKON CORPORATION");
ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "NIKON D7000");
ASSERT_STREQ(get_meta(&doc, MetaExifDateTime)->str_val, "2019:11:08 14:37:59");
ASSERT_STREQ(get_meta(&doc, MetaExifExposureTime)->str_val, "1:160");
ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "FinalDoom");
ASSERT_STREQ(get_meta(&doc, MetaExifSoftware)->str_val, "Adobe Photoshop Lightroom 5.7 (Windows)");
ASSERT_STREQ(get_meta(&doc, MetaExifFNumber)->str_val, "53:10");
ASSERT_STREQ(get_meta(&doc, MetaExifFocalLength)->str_val, "900:10");
ASSERT_STREQ(get_meta(&doc, MetaExifIsoSpeedRatings)->str_val, "400");
ASSERT_STREQ(get_meta(&doc, MetaExifExposureTime)->str_val, "1:160");
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
TEST(MediaVideo, Vid3Mp4) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc);
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - "
"https://archive.org/details/Virginia_Helicopter_Crash");
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "h264");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 825169);
ASSERT_EQ(get_meta(&doc, MetaMediaDuration)->long_val, 10);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
TEST(MediaVideo, Vid3Ogv) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc);
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261);
ASSERT_EQ(get_meta(&doc, MetaMediaDuration)->long_val, 10);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
TEST(MediaVideo, Vid3Webm) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc);
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153);
ASSERT_EQ(get_meta(&doc, MetaMediaDuration)->long_val, 10);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
//TODO: test music file with embedded cover art
TEST(MediaAudio, MusicMp3) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc);
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James");
ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams");
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "The Watchmaker");
ASSERT_STREQ(get_meta(&doc, MetaGenre)->str_val, "New Age");
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "http://magnatune.com/artists/barry_james");
ASSERT_STREQ(get_meta(&doc, MetaMediaAudioCodec)->str_val, "mp3");
cleanup(&doc, &f);
}
/* OOXML */
TEST(Ooxml, Pptx1) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ooxml/Catalist Presentation.pptx", &f, &doc);
parse_ooxml(&ooxml_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Slide 1");
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "thofeller");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Hofeller");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
cleanup(&doc, &f);
}
TEST(Ooxml, Docx1) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ooxml/How To Play A DVD On Windows 8.docx", &f, &doc);
parse_ooxml(&ooxml_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
cleanup(&doc, &f);
}
TEST(Ooxml, Xlsx1) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ooxml/xlsx1.xlsx", &f, &doc);
parse_ooxml(&ooxml_500_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Bureau of Economic Analysis");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "lz");
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1);
cleanup(&doc, &f);
}
int main(int argc, char **argv) {
arc_recurse_ctx.log = noop_log;
arc_recurse_ctx.logf = noop_logf;
arc_recurse_ctx.store = noop_store;
arc_recurse_ctx.mode = ARC_MODE_RECURSE;
arc_recurse_ctx.parse = nullptr; //TODO
arc_list_ctx.log = noop_log;
arc_list_ctx.logf = noop_logf;
arc_list_ctx.store = noop_store;
arc_list_ctx.mode = ARC_MODE_LIST;
text_500_ctx.content_size = 500;
text_500_ctx.log = noop_log;
text_500_ctx.logf = noop_logf;
ebook_ctx.content_size = 999999999999;
ebook_ctx.store = noop_store;
ebook_ctx.tesseract_lang = "eng";
ebook_ctx.tesseract_path = "./tessdata";
ebook_ctx.tn_size = 500;
ebook_ctx.log = noop_log;
ebook_ctx.logf = noop_logf;
ebook_500_ctx = ebook_ctx;
ebook_500_ctx.content_size = 500;
media_ctx.log = noop_log;
media_ctx.logf = noop_logf;
media_ctx.store = noop_store;
media_ctx.tn_size = 500;
media_ctx.tn_qscale = 1.0;
ooxml_500_ctx.content_size = 500;
ooxml_500_ctx.log = noop_log;
ooxml_500_ctx.logf = noop_logf;
av_log_set_level(AV_LOG_QUIET);
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}