Compare commits

..

No commits in common. "3787475ecba7453a2a97ab470103606c2cecabb2" and "da172823745b67662846cf1970a47ebcea8fe50e" have entirely different histories.

4 changed files with 21 additions and 38 deletions

View File

@ -46,8 +46,6 @@ find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
@ -202,7 +200,6 @@ target_link_libraries(
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
target_include_directories(

View File

@ -5,7 +5,6 @@
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
int should_parse_filtered_file(const char *filepath, int ext) {
@ -143,10 +142,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
}
}
static __thread int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
@ -217,19 +213,14 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
}
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
// Handle excludes
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
continue;
}
char *p = strrchr(sub_job->filepath, '.');
if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
if (p != NULL) {
sub_job->ext = (int) (p - sub_job->filepath + 1);
} else {
sub_job->ext = (int) strlen(sub_job->filepath);
}
memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx));
SHA1_Init(&sub_job->vfile.sha1_ctx);
ctx->parse(sub_job);

View File

@ -4,7 +4,6 @@
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
# define ARC_SKIPPED (-1)
@ -69,7 +68,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
int arc_read(struct vfile *f, void *buf, size_t size);

View File

@ -48,10 +48,9 @@ static scan_json_ctx_t json_ctx;
static document_t LastSubDoc;
static char *RecurseMediaMime = (char *) "";
void _parse_media(parse_job_t *job) {
parse_media(&media_ctx, &job->vfile, &LastSubDoc, RecurseMediaMime);
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
}
void _parse_ooxml(parse_job_t *job) {
@ -390,7 +389,7 @@ TEST(MediaImage, ExifGps1) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/exif_GPS.jpg", &f, &doc);
parse_media(&media_ctx, &f, &doc, "image/jpeg");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeRef)->str_val, "N");
ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeDMS)->str_val, "48:1 , 56585399:1000000, 0:1");
@ -406,7 +405,7 @@ TEST(MediaImage, Exif1) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc);
parse_media(&media_ctx, &f, &doc, "image/jpeg");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end "
"hotels or what, but I've seen it in a few places in Thailand: "
@ -435,8 +434,7 @@ TEST(MediaImage, Mem1) {
size_t size_before = store_size;
RecurseMediaMime = (char *) "image/jpeg";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_recurse_media_ctx, &f, &doc);
ASSERT_NE(size_before, store_size);
@ -450,7 +448,7 @@ TEST(MediaImage, AsIsFs) {
size_t size_before = store_size;
parse_media(&media_ctx, &f, &doc, "image/jpeg");
parse_media(&media_ctx, &f, &doc);
ASSERT_EQ(size_before + 14098, store_size);
@ -464,8 +462,7 @@ TEST(MediaImage, Mem2AsIs) {
size_t size_before = store_size;
RecurseMediaMime = (char *) "image/jpeg";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_recurse_media_ctx, &f, &doc);
ASSERT_EQ(size_before + 14098, store_size);
@ -478,7 +475,7 @@ TEST(MediaVideo, VidMkvSubDisabled) {
load_doc_file("libscan-test-files/test_files/media/berd.mkv", &f, &doc);
size_t size_before = store_size;
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
parse_media(&media_ctx, &f, &doc);
ASSERT_NE(size_before, store_size);
ASSERT_EQ(get_meta(&doc, MetaContent), nullptr);
@ -493,7 +490,7 @@ TEST(MediaVideo, VidMkvSubEnabled) {
size_t size_before = store_size;
media_ctx.read_subtitles = TRUE;
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
parse_media(&media_ctx, &f, &doc);
media_ctx.read_subtitles = FALSE;
ASSERT_NE(size_before, store_size);
@ -507,7 +504,7 @@ TEST(MediaVideo, Vid3Mp4) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc);
parse_media(&media_ctx, &f, &doc, "video/mp4");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - "
"https://archive.org/details/Virginia_Helicopter_Crash");
@ -524,7 +521,7 @@ TEST(MediaVideo, Vid3Ogv) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc);
parse_media(&media_ctx, &f, &doc, "application/ogg");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261);
@ -539,7 +536,7 @@ TEST(MediaVideo, Vid3Webm) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc);
parse_media(&media_ctx, &f, &doc, "video/webm");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153);
@ -556,8 +553,7 @@ TEST(MediaVideoVfile, Vid3Ogv) {
size_t size_before = store_size;
RecurseMediaMime = (char *) "video/webm";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_recurse_media_ctx, &f, &doc);
// ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora");
ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261);
@ -572,7 +568,7 @@ TEST(MediaVideo, VidDuplicateTags) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc);
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
parse_media(&media_ctx, &f, &doc);
meta_line_t *meta_content = get_meta(&doc, MetaContent);
ASSERT_STREQ(meta_content->str_val, "he's got a point");
@ -596,7 +592,7 @@ TEST(MediaAudio, MusicMp3) {
document_t doc;
load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc);
parse_media(&media_ctx, &f, &doc, "audio/x-mpeg-3");
parse_media(&media_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James");
ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams");
@ -663,7 +659,7 @@ TEST(Ooxml, Docx2Archive) {
load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc);
ooxml_500_ctx.content_size = 999999;
parse_archive(&arc_recurse_ooxml_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_recurse_ooxml_ctx, &f, &doc);
ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans");
ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1);
@ -755,7 +751,7 @@ TEST(Arc, Utf8) {
document_t doc;
load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc);
parse_archive(&arc_list_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_list_ctx, &f, &doc);
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr);
@ -770,7 +766,7 @@ TEST(Arc, EncryptedZip) {
size_t size_before = store_size;
strcpy(arc_recurse_media_ctx.passphrase, "sist2");
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
parse_archive(&arc_recurse_media_ctx, &f, &doc);
arc_recurse_media_ctx.passphrase[0] = '\0';