mirror of
				https://github.com/simon987/libscan.git
				synced 2025-10-25 23:26:52 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			babd6b6d13
			...
			097580eb40
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 097580eb40 | |||
| b1f001d8f1 | |||
| 4ea76adfaa | |||
| 8212dd4b23 | |||
| 49d4f1ae48 | |||
| 0a9742b686 | 
| @ -47,19 +47,25 @@ int arc_read(struct vfile *f, void *buf, size_t size) { | |||||||
|     return read; |     return read; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| int arc_open(vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) { | int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) { | ||||||
|     arc_data->f = f; |     arc_data->f = f; | ||||||
| 
 | 
 | ||||||
|     if (f->is_fs_file) { |     if (f->is_fs_file) { | ||||||
|         *a = archive_read_new(); |         *a = archive_read_new(); | ||||||
|         archive_read_support_filter_all(*a); |         archive_read_support_filter_all(*a); | ||||||
|         archive_read_support_format_all(*a); |         archive_read_support_format_all(*a); | ||||||
|  |         if (ctx->passphrase[0] != 0) { | ||||||
|  |             archive_read_add_passphrase(*a, ctx->passphrase); | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
|        return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); |        return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); | ||||||
|     } else if (allow_recurse) { |     } else if (allow_recurse) { | ||||||
|         *a = archive_read_new(); |         *a = archive_read_new(); | ||||||
|         archive_read_support_filter_all(*a); |         archive_read_support_filter_all(*a); | ||||||
|         archive_read_support_format_all(*a); |         archive_read_support_format_all(*a); | ||||||
|  |         if (ctx->passphrase[0] != 0) { | ||||||
|  |             archive_read_add_passphrase(*a, ctx->passphrase); | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
|         return archive_read_open( |         return archive_read_open( | ||||||
|                 *a, arc_data, |                 *a, arc_data, | ||||||
| @ -80,7 +86,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { | |||||||
|     arc_data_t arc_data; |     arc_data_t arc_data; | ||||||
|     arc_data.f = f; |     arc_data.f = f; | ||||||
| 
 | 
 | ||||||
|     int ret = arc_open(f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE); |     int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE); | ||||||
|     if (ret == ARC_SKIPPED) { |     if (ret == ARC_SKIPPED) { | ||||||
|         return SCAN_OK; |         return SCAN_OK; | ||||||
|     } |     } | ||||||
|  | |||||||
| @ -20,6 +20,7 @@ typedef struct { | |||||||
|     log_callback_t log; |     log_callback_t log; | ||||||
|     logf_callback_t logf; |     logf_callback_t logf; | ||||||
|     store_callback_t store; |     store_callback_t store; | ||||||
|  |     char passphrase[4096]; | ||||||
| } scan_arc_ctx_t; | } scan_arc_ctx_t; | ||||||
| 
 | 
 | ||||||
| #define ARC_BUF_SIZE 8192 | #define ARC_BUF_SIZE 8192 | ||||||
| @ -56,7 +57,7 @@ static int vfile_close_callback(struct archive *a, void *user_data) { | |||||||
|     return ARCHIVE_OK; |     return ARCHIVE_OK; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| int arc_open(vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse); | int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse); | ||||||
| 
 | 
 | ||||||
| int should_parse_filtered_file(const char *filepath, int ext); | int should_parse_filtered_file(const char *filepath, int ext); | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -5,13 +5,14 @@ | |||||||
| #include <stdlib.h> | #include <stdlib.h> | ||||||
| #include <archive.h> | #include <archive.h> | ||||||
| 
 | 
 | ||||||
|  | static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}}; | ||||||
| 
 | 
 | ||||||
| void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) { | void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) { | ||||||
|     struct archive *a = NULL; |     struct archive *a = NULL; | ||||||
|     struct archive_entry *entry = NULL; |     struct archive_entry *entry = NULL; | ||||||
|     arc_data_t arc_data; |     arc_data_t arc_data; | ||||||
| 
 | 
 | ||||||
|     int ret = arc_open(f, &a, &arc_data, TRUE); |     int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE); | ||||||
|     if (ret != ARCHIVE_OK) { |     if (ret != ARCHIVE_OK) { | ||||||
|         CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a)) |         CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a)) | ||||||
|         archive_read_free(a); |         archive_read_free(a); | ||||||
| @ -21,17 +22,17 @@ void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) { | |||||||
|     while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { |     while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { | ||||||
|         struct stat info = *archive_entry_stat(entry); |         struct stat info = *archive_entry_stat(entry); | ||||||
|         if (S_ISREG(info.st_mode)) { |         if (S_ISREG(info.st_mode)) { | ||||||
|             const char* utf8_name = archive_entry_pathname_utf8(entry); |             const char *utf8_name = archive_entry_pathname_utf8(entry); | ||||||
|             const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; |             const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; | ||||||
| 
 | 
 | ||||||
|             char *p = strrchr(file_path, '.'); |             char *p = strrchr(file_path, '.'); | ||||||
|             if (p != NULL && strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0) { |             if (p != NULL && strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0) { | ||||||
|                 size_t entry_size = archive_entry_size(entry); |                 size_t entry_size = archive_entry_size(entry); | ||||||
|                 void* buf = malloc(entry_size); |                 void *buf = malloc(entry_size); | ||||||
|                 int read = archive_read_data(a, buf, entry_size); |                 int read = archive_read_data(a, buf, entry_size); | ||||||
| 
 | 
 | ||||||
|                 if (read != entry_size) { |                 if (read != entry_size) { | ||||||
|                     const char* err_str = archive_error_string(a); |                     const char *err_str = archive_error_string(a); | ||||||
|                     if (err_str) { |                     if (err_str) { | ||||||
|                         CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str) |                         CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str) | ||||||
|                     } |                     } | ||||||
| @ -39,7 +40,7 @@ void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) { | |||||||
|                     break; |                     break; | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|                 ret = store_image_thumbnail((scan_media_ctx_t*)ctx, buf, entry_size, doc, file_path); |                 ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path); | ||||||
|                 free(buf); |                 free(buf); | ||||||
| 
 | 
 | ||||||
|                 if (ret == TRUE) { |                 if (ret == TRUE) { | ||||||
|  | |||||||
| @ -255,7 +255,7 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc) { | void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) { | ||||||
| 
 | 
 | ||||||
|     fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); |     fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); | ||||||
|     thread_ctx = *ctx; |     thread_ctx = *ctx; | ||||||
| @ -285,26 +285,6 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     char title[8192] = {'\0',}; |  | ||||||
|     fz_try(fzctx) |  | ||||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); |  | ||||||
|     fz_catch(fzctx) |  | ||||||
|         ; |  | ||||||
| 
 |  | ||||||
|     if (strlen(title) > 0) { |  | ||||||
|         APPEND_UTF8_META(doc, MetaTitle, title) |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     char author[4096] = {'\0',}; |  | ||||||
|     fz_try(fzctx) |  | ||||||
|         fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); |  | ||||||
|     fz_catch(fzctx) |  | ||||||
|         ; |  | ||||||
| 
 |  | ||||||
|     if (strlen(author) > 0) { |  | ||||||
|         APPEND_UTF8_META(doc, MetaAuthor, author) |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     int page_count = -1; |     int page_count = -1; | ||||||
|     fz_var(err); |     fz_var(err); | ||||||
|     fz_try(fzctx) |     fz_try(fzctx) | ||||||
| @ -331,6 +311,33 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     if (tn_only) { | ||||||
|  |         fz_drop_stream(fzctx, stream); | ||||||
|  |         fz_drop_document(fzctx, fzdoc); | ||||||
|  |         fz_drop_context(fzctx); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     char title[8192] = {'\0',}; | ||||||
|  |     fz_try(fzctx) | ||||||
|  |                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); | ||||||
|  |     fz_catch(fzctx) | ||||||
|  |         ; | ||||||
|  | 
 | ||||||
|  |     if (strlen(title) > 0) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaTitle, title) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     char author[4096] = {'\0',}; | ||||||
|  |     fz_try(fzctx) | ||||||
|  |                 fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); | ||||||
|  |     fz_catch(fzctx) | ||||||
|  |         ; | ||||||
|  | 
 | ||||||
|  |     if (strlen(author) > 0) { | ||||||
|  |         APPEND_UTF8_META(doc, MetaAuthor, author) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
|     if (ctx->content_size > 0) { |     if (ctx->content_size > 0) { | ||||||
|         fz_stext_options opts = {0}; |         fz_stext_options opts = {0}; | ||||||
| @ -425,6 +432,6 @@ void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, docume | |||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc); |     parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE); | ||||||
|     free(buf); |     free(buf); | ||||||
| } | } | ||||||
|  | |||||||
| @ -16,6 +16,6 @@ typedef struct { | |||||||
| } scan_ebook_ctx_t; | } scan_ebook_ctx_t; | ||||||
| 
 | 
 | ||||||
| void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str,  document_t *doc); | void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str,  document_t *doc); | ||||||
| void parse_ebook_mem(scan_ebook_ctx_t *ctx, void* buf, size_t buf_len, const char* mime_str,  document_t *doc); | void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only); | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
|  | |||||||
| @ -85,7 +85,59 @@ static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) { | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| __always_inline | __always_inline | ||||||
| static frame_and_packet_t *read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx, | static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) { | ||||||
|  | 
 | ||||||
|  |     text_buffer_t tex = text_buffer_create(-1); | ||||||
|  | 
 | ||||||
|  |     AVPacket packet; | ||||||
|  |     AVSubtitle subtitle; | ||||||
|  | 
 | ||||||
|  |     AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id); | ||||||
|  |     AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec); | ||||||
|  |     avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar); | ||||||
|  |     avcodec_open2(decoder, subtitle_codec, NULL); | ||||||
|  | 
 | ||||||
|  |     decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS; | ||||||
|  | 
 | ||||||
|  |     int got_sub; | ||||||
|  | 
 | ||||||
|  |     while (1) { | ||||||
|  |         int read_frame_ret = av_read_frame(pFormatCtx, &packet); | ||||||
|  | 
 | ||||||
|  |         if (read_frame_ret != 0) { | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         if (packet.stream_index != stream_idx) { | ||||||
|  |             av_packet_unref(&packet); | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet); | ||||||
|  | 
 | ||||||
|  |         if (got_sub) { | ||||||
|  |             for (int i = 0; i < subtitle.num_rects; i++) { | ||||||
|  |                 const char *text = subtitle.rects[i]->ass; | ||||||
|  | 
 | ||||||
|  |                 char *idx = strstr(text, "\\N"); | ||||||
|  |                 if (idx != NULL && strlen(idx + 2) > 1) { | ||||||
|  |                     text_buffer_append_string0(&tex, idx + 2); | ||||||
|  |                     text_buffer_append_char(&tex, ' '); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             avsubtitle_free(&subtitle); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     text_buffer_terminate_string(&tex); | ||||||
|  | 
 | ||||||
|  |     APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf) | ||||||
|  |     text_buffer_destroy(&tex); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | __always_inline | ||||||
|  | static frame_and_packet_t * | ||||||
|  | read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx, | ||||||
|            document_t *doc) { |            document_t *doc) { | ||||||
| 
 | 
 | ||||||
|     frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t)); |     frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t)); | ||||||
| @ -261,6 +313,7 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, | |||||||
| 
 | 
 | ||||||
|     int video_stream = -1; |     int video_stream = -1; | ||||||
|     int audio_stream = -1; |     int audio_stream = -1; | ||||||
|  |     int subtitle_stream = -1; | ||||||
| 
 | 
 | ||||||
|     avformat_find_stream_info(pFormatCtx, NULL); |     avformat_find_stream_info(pFormatCtx, NULL); | ||||||
| 
 | 
 | ||||||
| @ -299,6 +352,17 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, | |||||||
| 
 | 
 | ||||||
|                 video_stream = i; |                 video_stream = i; | ||||||
|             } |             } | ||||||
|  |         } else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) { | ||||||
|  |             subtitle_stream = i; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (subtitle_stream != -1 && ctx->read_subtitles) { | ||||||
|  |         read_subtitles(ctx, pFormatCtx, subtitle_stream, doc); | ||||||
|  | 
 | ||||||
|  |         // Reset stream
 | ||||||
|  |         if (video_stream != -1) { | ||||||
|  |             av_seek_frame(pFormatCtx, video_stream, 0, 0); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| @ -352,7 +416,8 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, | |||||||
| 
 | 
 | ||||||
|         if (scaled_frame == STORE_AS_IS) { |         if (scaled_frame == STORE_AS_IS) { | ||||||
|             APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height) |             APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height) | ||||||
|             ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data, frame_and_packet->packet->size); |             ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data, | ||||||
|  |                        frame_and_packet->packet->size); | ||||||
|         } else { |         } else { | ||||||
|             // Encode frame to jpeg
 |             // Encode frame to jpeg
 | ||||||
|             AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, |             AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, | ||||||
| @ -532,7 +597,7 @@ void init_media() { | |||||||
|     av_log_set_level(AV_LOG_QUIET); |     av_log_set_level(AV_LOG_QUIET); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| int store_image_thumbnail(scan_media_ctx_t *ctx, void* buf, size_t buf_len, document_t *doc, const char *url) { | int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) { | ||||||
|     memfile_t memfile; |     memfile_t memfile; | ||||||
|     AVIOContext *io_ctx = NULL; |     AVIOContext *io_ctx = NULL; | ||||||
| 
 | 
 | ||||||
| @ -604,7 +669,8 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void* buf, size_t buf_len, docu | |||||||
| 
 | 
 | ||||||
|     if (scaled_frame == STORE_AS_IS) { |     if (scaled_frame == STORE_AS_IS) { | ||||||
|         APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height) |         APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height) | ||||||
|         ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data, frame_and_packet->packet->size); |         ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data, | ||||||
|  |                    frame_and_packet->packet->size); | ||||||
|     } else { |     } else { | ||||||
|         // Encode frame to jpeg
 |         // Encode frame to jpeg
 | ||||||
|         AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, |         AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, | ||||||
|  | |||||||
| @ -22,7 +22,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | |||||||
| 
 | 
 | ||||||
|     int doc_word_version = iGuessVersionNumber(file_in, buf_len); |     int doc_word_version = iGuessVersionNumber(file_in, buf_len); | ||||||
|     if (doc_word_version < 0 || doc_word_version == 3) { |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|         fclose(file_in); |  | ||||||
|         free(buf); |         free(buf); | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| @ -68,7 +67,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi | |||||||
|         text_buffer_destroy(&tex); |         text_buffer_destroy(&tex); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     fclose(file_in); |  | ||||||
|     free(buf); |     free(buf); | ||||||
|     free(out_buf); |     free(out_buf); | ||||||
| } | } | ||||||
| @ -84,21 +82,19 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     // Open word doc
 |     // Open word doc
 | ||||||
| 
 |  | ||||||
|     options_type *opts = direct_vGetOptions(); |     options_type *opts = direct_vGetOptions(); | ||||||
|     opts->iParagraphBreak = 74; |     opts->iParagraphBreak = 74; | ||||||
|     opts->eConversionType = conversion_pdf; |     opts->eConversionType = conversion_pdf; | ||||||
|     opts->bHideHiddenText = 1; |     opts->bHideHiddenText = 1; | ||||||
|     opts->bRemoveRemovedText = 1; |     opts->bRemoveRemovedText = 1; | ||||||
|     opts->bUseLandscape = 0; |     opts->bUseLandscape = 0; | ||||||
|     opts->eEncoding = encoding_latin_2; |     opts->eEncoding = encoding_latin_1; | ||||||
|     opts->iPageHeight = 842; // A4
 |     opts->iPageHeight = 842; // A4
 | ||||||
|     opts->iPageWidth = 595; |     opts->iPageWidth = 595; | ||||||
|     opts->eImageLevel = level_ps_3; |     opts->eImageLevel = level_ps_3; | ||||||
| 
 | 
 | ||||||
|     int doc_word_version = iGuessVersionNumber(file, buf_len); |     int doc_word_version = iGuessVersionNumber(file, buf_len); | ||||||
|     if (doc_word_version < 0 || doc_word_version == 3) { |     if (doc_word_version < 0 || doc_word_version == 3) { | ||||||
|         fclose(file); |  | ||||||
|         free(buf); |         free(buf); | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| @ -111,7 +107,6 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
| 
 | 
 | ||||||
|     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); |     diagram_type *diag = pCreateDiagram("antiword", NULL, file_out); | ||||||
|     if (diag == NULL) { |     if (diag == NULL) { | ||||||
|         fclose(file); |  | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| @ -120,9 +115,8 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b | |||||||
| 
 | 
 | ||||||
|     fclose(file_out); |     fclose(file_out); | ||||||
| 
 | 
 | ||||||
|     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc); |     parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE); | ||||||
| 
 | 
 | ||||||
|     fclose(file); |  | ||||||
|     free(buf); |     free(buf); | ||||||
|     free(out_buf); |     free(out_buf); | ||||||
| } | } | ||||||
| @ -144,8 +138,10 @@ void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) { | |||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (ctx->tn_size > 0) { |     if (ctx->tn_size > 0) { | ||||||
|         parse_msdoc_pdf(ctx, doc, file, buf, buf_len); |         char *buf_pdf = malloc(buf_len); | ||||||
|     } else { |         memcpy(buf_pdf, buf, buf_len); | ||||||
|         parse_msdoc_text(ctx, doc, file, buf, buf_len); |         parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len); | ||||||
|     } |     } | ||||||
|  |     parse_msdoc_text(ctx, doc, file, buf, buf_len); | ||||||
|  |     fclose(file); | ||||||
| } | } | ||||||
|  | |||||||
| @ -595,6 +595,23 @@ TEST(Arc, Utf8) { | |||||||
|     cleanup(&doc, &f); |     cleanup(&doc, &f); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | TEST(Arc, EncryptedZip) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/arc/encrypted.zip", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     strcpy(arc_recurse_media_ctx.passphrase, "sist2"); | ||||||
|  |     parse_archive(&arc_recurse_media_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     arc_recurse_media_ctx.passphrase[0] = '\0'; | ||||||
|  | 
 | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* RAW */ | /* RAW */ | ||||||
| TEST(RAW, Panasonic) { | TEST(RAW, Panasonic) { | ||||||
|     vfile_t f; |     vfile_t f; | ||||||
| @ -786,6 +803,34 @@ TEST(Msdoc, Test4Pdf) { | |||||||
|     cleanup(&doc, &f); |     cleanup(&doc, &f); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | TEST(Msdoc, TestUtf8Pdf) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     size_t size_before = store_size; | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_NE(get_meta(&doc, MetaContent), nullptr); | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||||
|  |     ASSERT_NE(size_before, store_size); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST(Msdoc, TestUtf8Text) { | ||||||
|  |     vfile_t f; | ||||||
|  |     document_t doc; | ||||||
|  |     load_doc_file("libscan-test-files/test_files/msdoc/japanese.doc", &f, &doc); | ||||||
|  | 
 | ||||||
|  |     parse_msdoc(&msdoc_text_ctx, &f, &doc); | ||||||
|  | 
 | ||||||
|  |     ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr); | ||||||
|  | 
 | ||||||
|  |     cleanup(&doc, &f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| TEST(Msdoc, TestFuzz1) { | TEST(Msdoc, TestFuzz1) { | ||||||
|     vfile_t f; |     vfile_t f; | ||||||
|     document_t doc; |     document_t doc; | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								third-party/antiword
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | |||||||
| Subproject commit eb8d737eea2866bfb45e50423a1fd6c51454c2f6 | Subproject commit 62ae66db99e9dd88dfa31999f516f71bb8bdc8b2 | ||||||
							
								
								
									
										2
									
								
								third-party/utf8.h
									
									
									
									
										vendored
									
									
								
							
							
								
								
								
								
								
								
									
									
								
							
						
						
									
										2
									
								
								third-party/utf8.h
									
									
									
									
										vendored
									
									
								
							| @ -1 +1 @@ | |||||||
| Subproject commit a67acc78fd0fc272ad45362b828efdcb24874e64 | Subproject commit ee5a7d4beb7755da13e4d4ec3eccfb65a0530456 | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user