From da172823745b67662846cf1970a47ebcea8fe50e Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 11 Sep 2021 20:46:25 -0400 Subject: [PATCH] Support for rewind buffer --- libscan/arc/arc.c | 59 +++++++++++++++++++++++++++++++++++++++++-- libscan/arc/arc.h | 2 ++ libscan/media/media.c | 52 +++++++++++++++++++++++++------------- libscan/media/media.h | 2 +- libscan/ooxml/ooxml.c | 2 -- libscan/scan.h | 8 +++++- test/main.cpp | 47 ++++++++++++++++++++++++++++++++++ test/test_util.cpp | 6 ++--- 8 files changed, 152 insertions(+), 26 deletions(-) diff --git a/libscan/arc/arc.c b/libscan/arc/arc.c index d89c188..6830b7e 100644 --- a/libscan/arc/arc.c +++ b/libscan/arc/arc.c @@ -36,19 +36,46 @@ int should_parse_filtered_file(const char *filepath, int ext) { void arc_close(struct vfile *f) { SHA1_Final(f->sha1_digest, &f->sha1_ctx); + + if (f->rewind_buffer != NULL) { + free(f->rewind_buffer); + f->rewind_buffer = NULL; + f->rewind_buffer_size = 0; + f->rewind_buffer_cursor = 0; + } } int arc_read(struct vfile *f, void *buf, size_t size) { + + int bytes_copied = 0; + + if (f->rewind_buffer_size != 0) { + if (size > f->rewind_buffer_size) { + memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size); + + bytes_copied = f->rewind_buffer_size; + size -= f->rewind_buffer_size; + buf += f->rewind_buffer_size; + f->rewind_buffer_size = 0; + } else { + memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size); + f->rewind_buffer_size -= (int) size; + f->rewind_buffer_cursor += (int) size; + + return (int) size; + } + } + size_t bytes_read = archive_read_data(f->arc, buf, size); if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) { f->has_checksum = TRUE; - safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read); + safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read); } - if (bytes_read != size) { + if (bytes_read != size && archive_errno(f->arc) != 0) { const char *error_str = archive_error_string(f->arc); if (error_str != NULL) { f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str); @@ -56,6 +83,31 @@ int arc_read(struct vfile *f, void *buf, size_t size) { return -1; } + return (int) bytes_read + bytes_copied; +} + +int arc_read_rewindable(struct vfile *f, void *buf, size_t size) { + + if (f->rewind_buffer != NULL) { + fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath); + exit(-1); + } + + size_t bytes_read = archive_read_data(f->arc, buf, size); + + if (bytes_read != size && archive_errno(f->arc) != 0) { + const char *error_str = archive_error_string(f->arc); + if (error_str != NULL) { + f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str); + } + return -1; + } + + f->rewind_buffer = malloc(size); + f->rewind_buffer_size = (int) size; + f->rewind_buffer_cursor = 0; + memcpy(f->rewind_buffer, buf, size); + return (int) bytes_read; } @@ -135,10 +187,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { sub_job->vfile.close = arc_close; sub_job->vfile.read = arc_read; + sub_job->vfile.read_rewindable = arc_read_rewindable; sub_job->vfile.reset = NULL; sub_job->vfile.arc = a; sub_job->vfile.filepath = sub_job->filepath; sub_job->vfile.is_fs_file = FALSE; + sub_job->vfile.rewind_buffer_size = 0; + sub_job->vfile.rewind_buffer = NULL; sub_job->vfile.log = ctx->log; sub_job->vfile.logf = ctx->logf; sub_job->vfile.has_checksum = FALSE; diff --git a/libscan/arc/arc.h b/libscan/arc/arc.h index 346f36a..b38ce98 100644 --- a/libscan/arc/arc.h +++ b/libscan/arc/arc.h @@ -72,6 +72,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc); int arc_read(struct vfile *f, void *buf, size_t size); +int arc_read_rewindable(struct vfile *f, void *buf, size_t size); + void arc_close(struct vfile *f); #endif diff --git a/libscan/media/media.c b/libscan/media/media.c index dba4bcf..411e8cc 100644 --- a/libscan/media/media.c +++ b/libscan/media/media.c @@ -7,6 +7,22 @@ #define STORE_AS_IS ((void*)-1) +const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) { + + int has_extension = doc->ext > doc->base; + + if (!has_extension) { + if (strcmp(mime_str, "image/png") == 0) { + return "file.png"; + } else if (strcmp(mime_str, "image/jpeg") == 0) { + return "file.jpg"; + } + } + + return filepath; +} + + __always_inline void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) { @@ -497,7 +513,7 @@ int vfile_read(void *ptr, uint8_t *buf, int buf_size) { } typedef struct { - struct stat info; + size_t size; FILE *file; void *buf; } memfile_t; @@ -518,7 +534,7 @@ long memfile_seek(void *ptr, long offset, int whence) { memfile_t *mem = ptr; if (whence == 0x10000) { - return mem->info.st_size; + return mem->size; } int ret = fseek(mem->file, offset, whence); @@ -530,31 +546,31 @@ long memfile_seek(void *ptr, long offset, int whence) { } int memfile_open(vfile_t *f, memfile_t *mem) { - mem->info = f->info; + mem->size = f->info.st_size; - mem->buf = malloc(mem->info.st_size); + mem->buf = malloc(mem->size); if (mem->buf == NULL) { return -1; } - int ret = f->read(f, mem->buf, mem->info.st_size); - mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); + int ret = f->read(f, mem->buf, mem->size); + mem->file = fmemopen(mem->buf, mem->size, "rb"); if (f->calculate_checksum) { SHA1_Init(&f->sha1_ctx); - safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size); + safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size); SHA1_Final(f->sha1_digest, &f->sha1_ctx); f->has_checksum = TRUE; } - return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1; + return (ret == mem->size && mem->file != NULL) ? 0 : -1; } int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) { - mem->info.st_size = (int) buf_len; + mem->size = (int) buf_len; mem->buf = buf; - mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); + mem->file = fmemopen(mem->buf, mem->size, "rb"); return mem->file != NULL ? 0 : -1; } @@ -566,7 +582,7 @@ void memfile_close(memfile_t *mem) { } } -void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) { +void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) { AVFormatContext *pFormatCtx = avformat_alloc_context(); if (pFormatCtx == NULL) { @@ -576,7 +592,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE); AVIOContext *io_ctx = NULL; - memfile_t memfile = {{}, 0, 0}; + memfile_t memfile = {0, 0, 0}; + + const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str); if (f->info.st_size <= ctx->max_media_buffer) { int ret = memfile_open(f, &memfile); @@ -593,7 +611,7 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) pFormatCtx->pb = io_ctx; - int res = avformat_open_input(&pFormatCtx, f->filepath, NULL, NULL); + int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL); if (res < 0) { if (res != -5) { CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res)) @@ -612,12 +630,12 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) memfile_close(&memfile); } -void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) { +void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) { if (f->is_fs_file) { parse_media_filename(ctx, f->filepath, doc); } else { - parse_media_vfile(ctx, f, doc); + parse_media_vfile(ctx, f, doc, mime_str); } } @@ -626,7 +644,7 @@ void init_media() { } int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) { - memfile_t memfile = {{}, 0, 0}; + memfile_t memfile = {0, 0, 0}; AVIOContext *io_ctx = NULL; AVFormatContext *pFormatCtx = avformat_alloc_context(); @@ -663,7 +681,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu AVStream *stream = pFormatCtx->streams[0]; // Decoder - AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id); + const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id); AVCodecContext *decoder = avcodec_alloc_context3(video_codec); avcodec_parameters_to_context(decoder, stream->codecpar); avcodec_open2(decoder, video_codec, NULL); diff --git a/libscan/media/media.h b/libscan/media/media.h index 4ca1946..d62c249 100644 --- a/libscan/media/media.h +++ b/libscan/media/media.h @@ -43,7 +43,7 @@ static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) { } -void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc); +void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str); void init_media(); diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c index d4fd7b3..c975f31 100644 --- a/libscan/ooxml/ooxml.c +++ b/libscan/ooxml/ooxml.c @@ -41,8 +41,6 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_ if (err->level == XML_ERR_FATAL) { CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message) return -1; - } else { - CTX_LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message) } } diff --git a/libscan/scan.h b/libscan/scan.h index 47b0177..1ecfd22 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -17,7 +17,9 @@ #define UNUSED(x) __attribute__((__unused__)) x typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len); + typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...); + typedef void (*log_callback_t)(const char *filepath, int level, char *str); typedef int scan_code_t; @@ -140,8 +142,12 @@ typedef struct vfile { SHA_CTX sha1_ctx; unsigned char sha1_digest[SHA1_DIGEST_LENGTH]; + void *rewind_buffer; + int rewind_buffer_size; + int rewind_buffer_cursor; + read_func_t read; - seek_func_t seek; + read_func_t read_rewindable; close_func_t close; reset_func_t reset; log_callback_t log; diff --git a/test/main.cpp b/test/main.cpp index 675a8ab..6e988db 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -18,6 +18,7 @@ extern "C" { static scan_arc_ctx_t arc_recurse_media_ctx; static scan_arc_ctx_t arc_list_ctx; +static scan_arc_ctx_t arc_recurse_ooxml_ctx; static scan_text_ctx_t text_500_ctx; @@ -52,6 +53,10 @@ void _parse_media(parse_job_t *job) { parse_media(&media_ctx, &job->vfile, &LastSubDoc); } +void _parse_ooxml(parse_job_t *job) { + parse_ooxml(&ooxml_500_ctx, &job->vfile, &LastSubDoc); +} + /* Text */ @@ -631,6 +636,42 @@ TEST(Ooxml, Docx1) { cleanup(&doc, &f); } +TEST(Ooxml, Docx2) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx", &f, &doc); + + ooxml_500_ctx.content_size = 999999; + parse_ooxml(&ooxml_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "liz evans"); + ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 1); + ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 2780); + + ooxml_500_ctx.content_size = 500; + + cleanup(&doc, &f); +} + +TEST(Ooxml, Docx2Archive) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc); + + ooxml_500_ctx.content_size = 999999; + parse_archive(&arc_recurse_ooxml_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans"); + ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1); + ASSERT_EQ(strlen(get_meta(&LastSubDoc, MetaContent)->str_val), 2780); + + fprintf(stderr, "%s\n", get_meta(&LastSubDoc, MetaContent)->str_val); + + ooxml_500_ctx.content_size = 500; + + cleanup(&doc, &f); +} + TEST(Ooxml, Docx2Thumbnail) { vfile_t f; document_t doc; @@ -1033,6 +1074,12 @@ int main(int argc, char **argv) { arc_recurse_media_ctx.mode = ARC_MODE_RECURSE; arc_recurse_media_ctx.parse = _parse_media; + arc_recurse_ooxml_ctx.log = noop_log; + arc_recurse_ooxml_ctx.logf = noop_logf; + arc_recurse_ooxml_ctx.store = counter_store; + arc_recurse_ooxml_ctx.mode = ARC_MODE_RECURSE; + arc_recurse_ooxml_ctx.parse = _parse_ooxml; + arc_list_ctx.log = noop_log; arc_list_ctx.logf = noop_logf; arc_list_ctx.store = counter_store; diff --git a/test/test_util.cpp b/test/test_util.cpp index 0174d40..ee78f10 100644 --- a/test/test_util.cpp +++ b/test/test_util.cpp @@ -16,7 +16,7 @@ int fs_read(struct vfile *f, void *buf, size_t size) { } } - return read(f->fd, buf, size); + return (int) read(f->fd, buf, size); } //Note: No out of bounds check @@ -68,7 +68,7 @@ void load_file(const char *filepath, vfile_t *f) { void load_mem(void *mem, size_t size, vfile_t *f) { f->filepath = "_mem_"; f->_test_data = mem; - f->info.st_size = size; + f->info.st_size = (int) size; f->read = mem_read; f->close = nullptr; f->is_fs_file = TRUE; @@ -108,7 +108,7 @@ void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) { } for (int disp = 0; disp < width; disp++) { - buf[offset + disp] = (int8_t)rand(); + buf[offset + disp] = (int8_t) rand(); } } }