mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-25 23:26:52 +00:00 
			
		
		
		
	Support for sha1sum
This commit is contained in:
		
							parent
							
								
									23da8ada5f
								
							
						
					
					
						commit
						52d7649322
					
				| @ -4,6 +4,7 @@ | ||||
| #include <stdlib.h> | ||||
| #include <string.h> | ||||
| #include <fcntl.h> | ||||
| #include <openssl/evp.h> | ||||
| 
 | ||||
| 
 | ||||
| int should_parse_filtered_file(const char *filepath, int ext) { | ||||
| @ -33,18 +34,29 @@ int should_parse_filtered_file(const char *filepath, int ext) { | ||||
|     return FALSE; | ||||
| } | ||||
| 
 | ||||
| int arc_read(struct vfile *f, void *buf, size_t size) { | ||||
|     size_t read = archive_read_data(f->arc, buf, size); | ||||
| void arc_close(struct vfile *f) { | ||||
|     SHA1_Final(f->sha1_digest, &f->sha1_ctx); | ||||
| } | ||||
| 
 | ||||
|     if (read != size) { | ||||
|         const char* error_str = archive_error_string(f->arc); | ||||
| 
 | ||||
| int arc_read(struct vfile *f, void *buf, size_t size) { | ||||
|     size_t bytes_read = archive_read_data(f->arc, buf, size); | ||||
| 
 | ||||
|     if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) { | ||||
|         f->has_checksum = TRUE; | ||||
| 
 | ||||
|         safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read); | ||||
|     } | ||||
| 
 | ||||
|     if (bytes_read != size) { | ||||
|         const char *error_str = archive_error_string(f->arc); | ||||
|         if (error_str != NULL) { | ||||
|             f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str); | ||||
|         } | ||||
|         return -1; | ||||
|     } | ||||
| 
 | ||||
|     return read; | ||||
|     return (int) bytes_read; | ||||
| } | ||||
| 
 | ||||
| int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) { | ||||
| @ -58,7 +70,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar | ||||
|             archive_read_add_passphrase(*a, ctx->passphrase); | ||||
|         } | ||||
| 
 | ||||
|        return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); | ||||
|         return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); | ||||
|     } else if (allow_recurse) { | ||||
|         *a = archive_read_new(); | ||||
|         archive_read_support_filter_all(*a); | ||||
| @ -102,8 +114,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
| 
 | ||||
|         while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { | ||||
|             if (S_ISREG(archive_entry_stat(entry)->st_mode)) { | ||||
|                 const char* utf8_name = archive_entry_pathname_utf8(entry); | ||||
|                 const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; | ||||
|                 const char *utf8_name = archive_entry_pathname_utf8(entry); | ||||
|                 const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; | ||||
| 
 | ||||
|                 dyn_buffer_append_string(&buf, file_path); | ||||
|                 dyn_buffer_write_char(&buf, ' '); | ||||
| @ -121,7 +133,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
| 
 | ||||
|         parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2); | ||||
| 
 | ||||
|         sub_job->vfile.close = NULL; | ||||
|         sub_job->vfile.close = arc_close; | ||||
|         sub_job->vfile.read = arc_read; | ||||
|         sub_job->vfile.reset = NULL; | ||||
|         sub_job->vfile.arc = a; | ||||
| @ -129,13 +141,15 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
|         sub_job->vfile.is_fs_file = FALSE; | ||||
|         sub_job->vfile.log = ctx->log; | ||||
|         sub_job->vfile.logf = ctx->logf; | ||||
|         sub_job->vfile.has_checksum = FALSE; | ||||
|         sub_job->vfile.calculate_checksum = f->calculate_checksum; | ||||
|         memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH); | ||||
| 
 | ||||
|         while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { | ||||
|             sub_job->vfile.info = *archive_entry_stat(entry); | ||||
|             if (S_ISREG(sub_job->vfile.info.st_mode)) { | ||||
| 
 | ||||
|                 const char* utf8_name = archive_entry_pathname_utf8(entry); | ||||
|                 const char *utf8_name = archive_entry_pathname_utf8(entry); | ||||
| 
 | ||||
|                 if (utf8_name == NULL) { | ||||
|                     sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry)); | ||||
| @ -151,6 +165,9 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
|                     sub_job->ext = (int) strlen(sub_job->filepath); | ||||
|                 } | ||||
| 
 | ||||
|                 memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx)); | ||||
|                 SHA1_Init(&sub_job->vfile.sha1_ctx); | ||||
| 
 | ||||
|                 ctx->parse(sub_job); | ||||
|             } | ||||
|         } | ||||
|  | ||||
| @ -6,7 +6,7 @@ | ||||
| #include <fcntl.h> | ||||
| #include "../scan.h" | ||||
| 
 | ||||
| # define ARC_SKIPPED -1 | ||||
| # define ARC_SKIPPED (-1) | ||||
| #define ARC_MODE_SKIP 0 | ||||
| #define ARC_MODE_LIST 1 | ||||
| #define ARC_MODE_SHALLOW 2 | ||||
| @ -31,27 +31,34 @@ typedef struct { | ||||
| } arc_data_t; | ||||
| 
 | ||||
| static int vfile_open_callback(struct archive *a, void *user_data) { | ||||
|     arc_data_t *data = (arc_data_t*)user_data; | ||||
|     arc_data_t *data = (arc_data_t *) user_data; | ||||
| 
 | ||||
|     if (data->f->is_fs_file && data->f->fd == -1) { | ||||
|         data->f->fd = open(data->f->filepath, O_RDONLY); | ||||
|     if (!data->f->is_fs_file) { | ||||
|         SHA1_Init(&data->f->sha1_ctx); | ||||
|     } | ||||
| 
 | ||||
|     return ARCHIVE_OK; | ||||
| } | ||||
| 
 | ||||
| static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) { | ||||
|     arc_data_t *data = (arc_data_t*)user_data; | ||||
|     arc_data_t *data = (arc_data_t *) user_data; | ||||
| 
 | ||||
|     *buf = data->buf; | ||||
|     return data->f->read(data->f, data->buf, ARC_BUF_SIZE); | ||||
|     long ret = data->f->read(data->f, data->buf, sizeof(data->buf)); | ||||
| 
 | ||||
|     if (!data->f->is_fs_file && ret > 0) { | ||||
|         data->f->has_checksum = TRUE; | ||||
|         safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret); | ||||
|     } | ||||
| 
 | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| static int vfile_close_callback(struct archive *a, void *user_data) { | ||||
|     arc_data_t *data = (arc_data_t*)user_data; | ||||
|     arc_data_t *data = (arc_data_t *) user_data; | ||||
| 
 | ||||
|     if (data->f->close != NULL) { | ||||
|         data->f->close(data->f); | ||||
|     if (!data->f->is_fs_file) { | ||||
|         SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx); | ||||
|     } | ||||
| 
 | ||||
|     return ARCHIVE_OK; | ||||
| @ -63,6 +70,8 @@ int should_parse_filtered_file(const char *filepath, int ext); | ||||
| 
 | ||||
| scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc); | ||||
| 
 | ||||
| int arc_read(struct vfile * f, void *buf, size_t size); | ||||
| int arc_read(struct vfile *f, void *buf, size_t size); | ||||
| 
 | ||||
| void arc_close(struct vfile *f); | ||||
| 
 | ||||
| #endif | ||||
|  | ||||
| @ -67,10 +67,10 @@ scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
| 
 | ||||
| scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||
| 
 | ||||
|     char *buf = malloc(JSON_BUF_SIZE + 1); | ||||
|     char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char)); | ||||
|     *(buf + JSON_BUF_SIZE) = '\0'; | ||||
| 
 | ||||
|     text_buffer_t tex = text_buffer_create(-1); | ||||
|     text_buffer_t tex = text_buffer_create(ctx->content_size); | ||||
| 
 | ||||
|     size_t ret; | ||||
|     int eof = FALSE; | ||||
|  | ||||
| @ -20,6 +20,9 @@ | ||||
| #undef ABS | ||||
| #define ABS(a) (((a) < 0) ? -(a) : (a)) | ||||
| 
 | ||||
| #define SHA1_STR_LENGTH 41 | ||||
| #define SHA1_DIGEST_LENGTH 20 | ||||
| 
 | ||||
| #define APPEND_STR_META(doc, keyname, value) \ | ||||
|     {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ | ||||
|     meta_str->key = keyname; \ | ||||
|  | ||||
| @ -511,7 +511,7 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) { | ||||
|         return AVERROR_EOF; | ||||
|     } | ||||
| 
 | ||||
|     return buf_size; | ||||
|     return (int) ret; | ||||
| } | ||||
| 
 | ||||
| long memfile_seek(void *ptr, long offset, int whence) { | ||||
| @ -540,11 +540,18 @@ int memfile_open(vfile_t *f, memfile_t *mem) { | ||||
|     int ret = f->read(f, mem->buf, mem->info.st_size); | ||||
|     mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); | ||||
| 
 | ||||
|     if (f->calculate_checksum) { | ||||
|         SHA1_Init(&f->sha1_ctx); | ||||
|         safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size); | ||||
|         SHA1_Final(f->sha1_digest, &f->sha1_ctx); | ||||
|         f->has_checksum = TRUE; | ||||
|     } | ||||
| 
 | ||||
|     return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1; | ||||
| } | ||||
| 
 | ||||
| int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) { | ||||
|     mem->info.st_size = buf_len; | ||||
|     mem->info.st_size = (int) buf_len; | ||||
| 
 | ||||
|     mem->buf = buf; | ||||
|     mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); | ||||
| @ -619,7 +626,7 @@ void init_media() { | ||||
| } | ||||
| 
 | ||||
| int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) { | ||||
|     memfile_t memfile; | ||||
|     memfile_t memfile = {{}, 0, 0}; | ||||
|     AVIOContext *io_ctx = NULL; | ||||
| 
 | ||||
|     AVFormatContext *pFormatCtx = avformat_alloc_context(); | ||||
| @ -637,8 +644,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu | ||||
|     } else { | ||||
|         avformat_close_input(&pFormatCtx); | ||||
|         avformat_free_context(pFormatCtx); | ||||
|         av_free(io_ctx->buffer); | ||||
|         avio_context_free(&io_ctx); | ||||
|         fclose(memfile.file); | ||||
|         return FALSE; | ||||
|     } | ||||
|  | ||||
| @ -8,6 +8,7 @@ | ||||
| #include <stdio.h> | ||||
| #include <sys/stat.h> | ||||
| #include <openssl/md5.h> | ||||
| #include <openssl/sha.h> | ||||
| 
 | ||||
| #include "macros.h" | ||||
| 
 | ||||
| @ -69,6 +70,7 @@ enum metakey { | ||||
|     MetaAuthor, | ||||
|     MetaModifiedBy, | ||||
|     MetaThumbnail, | ||||
|     MetaChecksum, | ||||
| 
 | ||||
|     // Number
 | ||||
|     MetaWidth, | ||||
| @ -130,9 +132,14 @@ typedef struct vfile { | ||||
|     }; | ||||
| 
 | ||||
|     int is_fs_file; | ||||
|     int has_checksum; | ||||
|     int calculate_checksum; | ||||
|     const char *filepath; | ||||
|     struct stat info; | ||||
| 
 | ||||
|     SHA_CTX sha1_ctx; | ||||
|     unsigned char sha1_digest[SHA1_DIGEST_LENGTH]; | ||||
| 
 | ||||
|     read_func_t read; | ||||
|     seek_func_t seek; | ||||
|     close_func_t close; | ||||
|  | ||||
| @ -13,7 +13,10 @@ | ||||
| #define INITIAL_BUF_SIZE (1024 * 16) | ||||
| 
 | ||||
| #define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c)) | ||||
| #define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127)) | ||||
| #define SHOULD_KEEP_CHAR(c) (\ | ||||
|     ((c) >= '\'' && (c) <= ';') || \ | ||||
|     ((c) >= 'A' && (c) <= 'z') || \ | ||||
|     ((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD)) | ||||
| 
 | ||||
| 
 | ||||
| typedef struct dyn_buffer { | ||||
| @ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) { | ||||
|     return buf; | ||||
| } | ||||
| 
 | ||||
| #define STACK_BUFFER_SIZE (size_t)(4096 * 8) | ||||
| 
 | ||||
| __always_inline | ||||
| static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) { | ||||
|     unsigned char stack_buf[STACK_BUFFER_SIZE]; | ||||
| 
 | ||||
|     void *sha1_buf; | ||||
|     if (size <= STACK_BUFFER_SIZE) { | ||||
|         sha1_buf = stack_buf; | ||||
|     } else { | ||||
|         void *heap_sha1_buf = malloc(size); | ||||
|         sha1_buf = heap_sha1_buf; | ||||
|     } | ||||
| 
 | ||||
|     memcpy(sha1_buf, buf, size); | ||||
|     SHA1_Update(ctx, (const void *) sha1_buf, size); | ||||
| 
 | ||||
|     if (sha1_buf != stack_buf) { | ||||
|         free(sha1_buf); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
|  | ||||
| @ -46,7 +46,7 @@ static scan_wpd_ctx_t wpd_ctx; | ||||
| static scan_json_ctx_t json_ctx; | ||||
| 
 | ||||
| 
 | ||||
| document_t LastSubDoc; | ||||
| static document_t LastSubDoc; | ||||
| 
 | ||||
| void _parse_media(parse_job_t *job) { | ||||
|     parse_media(&media_ctx, &job->vfile, &LastSubDoc); | ||||
| @ -225,6 +225,24 @@ TEST(Ebook, Utf8Pdf) { | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(Ebook, Utf8PdfInvalidChars) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|     load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc); | ||||
| 
 | ||||
|     ebook_ctx.tesseract_lang = nullptr; | ||||
| 
 | ||||
|     parse_ebook(&ebook_ctx, &f, "application/pdf", &doc); | ||||
| 
 | ||||
|     ebook_ctx.tesseract_lang = "eng"; | ||||
| 
 | ||||
|     // It should say "HART is a group of highly qualified ..." but the PDF
 | ||||
|     //  text has been intentionally garbled by the authors
 | ||||
|     // We can at least filter out the non-printable/invalid characters like U+FFFD (replacement character) etc
 | ||||
|     ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified ")); | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(Ebook, Pdf2) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
| @ -418,6 +436,20 @@ TEST(MediaImage, Mem1) { | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(MediaImage, AsIsFs) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|     load_doc_file("libscan-test-files/test_files/media/9555.jpg", &f, &doc); | ||||
| 
 | ||||
|     size_t size_before = store_size; | ||||
| 
 | ||||
|     parse_media(&media_ctx, &f, &doc); | ||||
| 
 | ||||
|     ASSERT_EQ(size_before + 14098, store_size); | ||||
| 
 | ||||
|     cleanup(&doc, &f); | ||||
| } | ||||
| 
 | ||||
| TEST(MediaImage, Mem2AsIs) { | ||||
|     vfile_t f; | ||||
|     document_t doc; | ||||
|  | ||||
| @ -61,6 +61,8 @@ void load_file(const char *filepath, vfile_t *f) { | ||||
|     f->read = fs_read; | ||||
|     f->close = fs_close; | ||||
|     f->is_fs_file = TRUE; | ||||
|     f->calculate_checksum = TRUE; | ||||
|     f->has_checksum = FALSE; | ||||
| } | ||||
| 
 | ||||
| void load_mem(void *mem, size_t size, vfile_t *f) { | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user