mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-26 15:36:52 +00:00 
			
		
		
		
	Compare commits
	
		
			No commits in common. "3787475ecba7453a2a97ab470103606c2cecabb2" and "da172823745b67662846cf1970a47ebcea8fe50e" have entirely different histories.
		
	
	
		
			3787475ecb
			...
			da17282374
		
	
		
| @ -46,8 +46,6 @@ find_package(JPEG REQUIRED) | |||||||
| find_package(LibXml2 REQUIRED) | find_package(LibXml2 REQUIRED) | ||||||
| find_package(LibLZMA REQUIRED) | find_package(LibLZMA REQUIRED) | ||||||
| find_package(ZLIB REQUIRED) | find_package(ZLIB REQUIRED) | ||||||
| find_package(unofficial-pcre CONFIG REQUIRED) |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec) | find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec) | ||||||
| find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd) | find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd) | ||||||
| @ -202,7 +200,6 @@ target_link_libraries( | |||||||
|         ${GUMBO_LIB} |         ${GUMBO_LIB} | ||||||
|         dl |         dl | ||||||
|         antiword |         antiword | ||||||
|         unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp |  | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| target_include_directories( | target_include_directories( | ||||||
|  | |||||||
| @ -5,7 +5,6 @@ | |||||||
| #include <string.h> | #include <string.h> | ||||||
| #include <fcntl.h> | #include <fcntl.h> | ||||||
| #include <openssl/evp.h> | #include <openssl/evp.h> | ||||||
| #include <pcre.h> |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| int should_parse_filtered_file(const char *filepath, int ext) { | int should_parse_filtered_file(const char *filepath, int ext) { | ||||||
| @ -143,10 +142,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static __thread int sub_strings[30]; | scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||||
| #define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0) |  | ||||||
| 
 |  | ||||||
| scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) { |  | ||||||
| 
 | 
 | ||||||
|     struct archive *a = NULL; |     struct archive *a = NULL; | ||||||
|     struct archive_entry *entry = NULL; |     struct archive_entry *entry = NULL; | ||||||
| @ -217,19 +213,14 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre | |||||||
|                 } |                 } | ||||||
|                 sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1; |                 sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1; | ||||||
| 
 | 
 | ||||||
|                 // Handle excludes
 |  | ||||||
|                 if (exclude != NULL && EXCLUDED(sub_job->filepath)) { |  | ||||||
|                     CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath) |  | ||||||
|                     continue; |  | ||||||
|                 } |  | ||||||
| 
 |  | ||||||
|                 char *p = strrchr(sub_job->filepath, '.'); |                 char *p = strrchr(sub_job->filepath, '.'); | ||||||
|                 if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) { |                 if (p != NULL) { | ||||||
|                     sub_job->ext = (int) (p - sub_job->filepath + 1); |                     sub_job->ext = (int) (p - sub_job->filepath + 1); | ||||||
|                 } else { |                 } else { | ||||||
|                     sub_job->ext = (int) strlen(sub_job->filepath); |                     sub_job->ext = (int) strlen(sub_job->filepath); | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|  |                 memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx)); | ||||||
|                 SHA1_Init(&sub_job->vfile.sha1_ctx); |                 SHA1_Init(&sub_job->vfile.sha1_ctx); | ||||||
| 
 | 
 | ||||||
|                 ctx->parse(sub_job); |                 ctx->parse(sub_job); | ||||||
|  | |||||||
| @ -4,7 +4,6 @@ | |||||||
| #include <archive.h> | #include <archive.h> | ||||||
| #include <archive_entry.h> | #include <archive_entry.h> | ||||||
| #include <fcntl.h> | #include <fcntl.h> | ||||||
| #include <pcre.h> |  | ||||||
| #include "../scan.h" | #include "../scan.h" | ||||||
| 
 | 
 | ||||||
| # define ARC_SKIPPED (-1) | # define ARC_SKIPPED (-1) | ||||||
| @ -69,7 +68,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar | |||||||
| 
 | 
 | ||||||
| int should_parse_filtered_file(const char *filepath, int ext); | int should_parse_filtered_file(const char *filepath, int ext); | ||||||
| 
 | 
 | ||||||
| scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra); | scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc); | ||||||
| 
 | 
 | ||||||
| int arc_read(struct vfile *f, void *buf, size_t size); | int arc_read(struct vfile *f, void *buf, size_t size); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -48,10 +48,9 @@ static scan_json_ctx_t json_ctx; | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| static document_t LastSubDoc; | static document_t LastSubDoc; | ||||||
| static char *RecurseMediaMime = (char *) ""; |  | ||||||
| 
 | 
 | ||||||
| void _parse_media(parse_job_t *job) { | void _parse_media(parse_job_t *job) { | ||||||
|     parse_media(&media_ctx, &job->vfile, &LastSubDoc, RecurseMediaMime); |     parse_media(&media_ctx, &job->vfile, &LastSubDoc); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void _parse_ooxml(parse_job_t *job) { | void _parse_ooxml(parse_job_t *job) { | ||||||
| @ -390,7 +389,7 @@ TEST(MediaImage, ExifGps1) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/exif_GPS.jpg", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/exif_GPS.jpg", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "image/jpeg"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeRef)->str_val, "N"); |     ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeRef)->str_val, "N"); | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeDMS)->str_val, "48:1 , 56585399:1000000, 0:1"); |     ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeDMS)->str_val, "48:1 , 56585399:1000000, 0:1"); | ||||||
| @ -406,7 +405,7 @@ TEST(MediaImage, Exif1) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "image/jpeg"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end " |     ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end " | ||||||
|                                                        "hotels or what, but I've seen it in a few places in Thailand: " |                                                        "hotels or what, but I've seen it in a few places in Thailand: " | ||||||
| @ -435,8 +434,7 @@ TEST(MediaImage, Mem1) { | |||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
| 
 | 
 | ||||||
|     RecurseMediaMime = (char *) "image/jpeg"; |     parse_archive(&arc_recurse_media_ctx, &f, &doc); | ||||||
|     parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr); |  | ||||||
| 
 | 
 | ||||||
|     ASSERT_NE(size_before, store_size); |     ASSERT_NE(size_before, store_size); | ||||||
| 
 | 
 | ||||||
| @ -450,7 +448,7 @@ TEST(MediaImage, AsIsFs) { | |||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "image/jpeg"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_EQ(size_before + 14098, store_size); |     ASSERT_EQ(size_before + 14098, store_size); | ||||||
| 
 | 
 | ||||||
| @ -464,8 +462,7 @@ TEST(MediaImage, Mem2AsIs) { | |||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
| 
 | 
 | ||||||
|     RecurseMediaMime = (char *) "image/jpeg"; |     parse_archive(&arc_recurse_media_ctx, &f, &doc); | ||||||
|     parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr); |  | ||||||
| 
 | 
 | ||||||
|     ASSERT_EQ(size_before + 14098, store_size); |     ASSERT_EQ(size_before + 14098, store_size); | ||||||
| 
 | 
 | ||||||
| @ -478,7 +475,7 @@ TEST(MediaVideo, VidMkvSubDisabled) { | |||||||
|     load_doc_file("libscan-test-files/test_files/media/berd.mkv", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/berd.mkv", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
|     parse_media(&media_ctx, &f, &doc, "video/x-matroska"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_NE(size_before, store_size); |     ASSERT_NE(size_before, store_size); | ||||||
|     ASSERT_EQ(get_meta(&doc, MetaContent), nullptr); |     ASSERT_EQ(get_meta(&doc, MetaContent), nullptr); | ||||||
| @ -493,7 +490,7 @@ TEST(MediaVideo, VidMkvSubEnabled) { | |||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
|     media_ctx.read_subtitles = TRUE; |     media_ctx.read_subtitles = TRUE; | ||||||
|     parse_media(&media_ctx, &f, &doc, "video/x-matroska"); |     parse_media(&media_ctx, &f, &doc); | ||||||
|     media_ctx.read_subtitles = FALSE; |     media_ctx.read_subtitles = FALSE; | ||||||
| 
 | 
 | ||||||
|     ASSERT_NE(size_before, store_size); |     ASSERT_NE(size_before, store_size); | ||||||
| @ -507,7 +504,7 @@ TEST(MediaVideo, Vid3Mp4) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "video/mp4"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - " |     ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - " | ||||||
|                                                      "https://archive.org/details/Virginia_Helicopter_Crash"); |                                                      "https://archive.org/details/Virginia_Helicopter_Crash"); | ||||||
| @ -524,7 +521,7 @@ TEST(MediaVideo, Vid3Ogv) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "application/ogg"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora"); |     ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora"); | ||||||
|     ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261); |     ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261); | ||||||
| @ -539,7 +536,7 @@ TEST(MediaVideo, Vid3Webm) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "video/webm"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8"); |     ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8"); | ||||||
|     ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153); |     ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153); | ||||||
| @ -556,8 +553,7 @@ TEST(MediaVideoVfile, Vid3Ogv) { | |||||||
| 
 | 
 | ||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
| 
 | 
 | ||||||
|     RecurseMediaMime = (char *) "video/webm"; |     parse_archive(&arc_recurse_media_ctx, &f, &doc); | ||||||
|     parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr); |  | ||||||
| 
 | 
 | ||||||
| //    ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora");
 | //    ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora");
 | ||||||
|     ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261); |     ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261); | ||||||
| @ -572,7 +568,7 @@ TEST(MediaVideo, VidDuplicateTags) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "video/x-matroska"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     meta_line_t *meta_content = get_meta(&doc, MetaContent); |     meta_line_t *meta_content = get_meta(&doc, MetaContent); | ||||||
|     ASSERT_STREQ(meta_content->str_val, "he's got a point"); |     ASSERT_STREQ(meta_content->str_val, "he's got a point"); | ||||||
| @ -596,7 +592,7 @@ TEST(MediaAudio, MusicMp3) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc); |     load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_media(&media_ctx, &f, &doc, "audio/x-mpeg-3"); |     parse_media(&media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James"); |     ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James"); | ||||||
|     ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams"); |     ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams"); | ||||||
| @ -663,7 +659,7 @@ TEST(Ooxml, Docx2Archive) { | |||||||
|     load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc); |     load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ooxml_500_ctx.content_size = 999999; |     ooxml_500_ctx.content_size = 999999; | ||||||
|     parse_archive(&arc_recurse_ooxml_ctx, &f, &doc, nullptr, nullptr); |     parse_archive(&arc_recurse_ooxml_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans"); |     ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans"); | ||||||
|     ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1); |     ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1); | ||||||
| @ -755,7 +751,7 @@ TEST(Arc, Utf8) { | |||||||
|     document_t doc; |     document_t doc; | ||||||
|     load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc); |     load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc); | ||||||
| 
 | 
 | ||||||
|     parse_archive(&arc_list_ctx, &f, &doc, nullptr, nullptr); |     parse_archive(&arc_list_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr); |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr); | ||||||
| 
 | 
 | ||||||
| @ -770,7 +766,7 @@ TEST(Arc, EncryptedZip) { | |||||||
|     size_t size_before = store_size; |     size_t size_before = store_size; | ||||||
| 
 | 
 | ||||||
|     strcpy(arc_recurse_media_ctx.passphrase, "sist2"); |     strcpy(arc_recurse_media_ctx.passphrase, "sist2"); | ||||||
|     parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr); |     parse_archive(&arc_recurse_media_ctx, &f, &doc); | ||||||
| 
 | 
 | ||||||
|     arc_recurse_media_ctx.passphrase[0] = '\0'; |     arc_recurse_media_ctx.passphrase[0] = '\0'; | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user