mirror of
https://github.com/simon987/libscan.git
synced 2025-04-09 13:56:42 +00:00
Support for rewind buffer
This commit is contained in:
parent
52d7649322
commit
da17282374
@ -36,19 +36,46 @@ int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
|
||||
void arc_close(struct vfile *f) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
|
||||
if (f->rewind_buffer != NULL) {
|
||||
free(f->rewind_buffer);
|
||||
f->rewind_buffer = NULL;
|
||||
f->rewind_buffer_size = 0;
|
||||
f->rewind_buffer_cursor = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
int bytes_copied = 0;
|
||||
|
||||
if (f->rewind_buffer_size != 0) {
|
||||
if (size > f->rewind_buffer_size) {
|
||||
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);
|
||||
|
||||
bytes_copied = f->rewind_buffer_size;
|
||||
size -= f->rewind_buffer_size;
|
||||
buf += f->rewind_buffer_size;
|
||||
f->rewind_buffer_size = 0;
|
||||
} else {
|
||||
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
|
||||
f->rewind_buffer_size -= (int) size;
|
||||
f->rewind_buffer_cursor += (int) size;
|
||||
|
||||
return (int) size;
|
||||
}
|
||||
}
|
||||
|
||||
size_t bytes_read = archive_read_data(f->arc, buf, size);
|
||||
|
||||
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read);
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
|
||||
}
|
||||
|
||||
if (bytes_read != size) {
|
||||
if (bytes_read != size && archive_errno(f->arc) != 0) {
|
||||
const char *error_str = archive_error_string(f->arc);
|
||||
if (error_str != NULL) {
|
||||
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
|
||||
@ -56,6 +83,31 @@ int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (int) bytes_read + bytes_copied;
|
||||
}
|
||||
|
||||
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
if (f->rewind_buffer != NULL) {
|
||||
fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
size_t bytes_read = archive_read_data(f->arc, buf, size);
|
||||
|
||||
if (bytes_read != size && archive_errno(f->arc) != 0) {
|
||||
const char *error_str = archive_error_string(f->arc);
|
||||
if (error_str != NULL) {
|
||||
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
f->rewind_buffer = malloc(size);
|
||||
f->rewind_buffer_size = (int) size;
|
||||
f->rewind_buffer_cursor = 0;
|
||||
memcpy(f->rewind_buffer, buf, size);
|
||||
|
||||
return (int) bytes_read;
|
||||
}
|
||||
|
||||
@ -135,10 +187,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
sub_job->vfile.close = arc_close;
|
||||
sub_job->vfile.read = arc_read;
|
||||
sub_job->vfile.read_rewindable = arc_read_rewindable;
|
||||
sub_job->vfile.reset = NULL;
|
||||
sub_job->vfile.arc = a;
|
||||
sub_job->vfile.filepath = sub_job->filepath;
|
||||
sub_job->vfile.is_fs_file = FALSE;
|
||||
sub_job->vfile.rewind_buffer_size = 0;
|
||||
sub_job->vfile.rewind_buffer = NULL;
|
||||
sub_job->vfile.log = ctx->log;
|
||||
sub_job->vfile.logf = ctx->logf;
|
||||
sub_job->vfile.has_checksum = FALSE;
|
||||
|
@ -72,6 +72,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size);
|
||||
|
||||
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
|
||||
|
||||
void arc_close(struct vfile *f);
|
||||
|
||||
#endif
|
||||
|
@ -7,6 +7,22 @@
|
||||
|
||||
#define STORE_AS_IS ((void*)-1)
|
||||
|
||||
/**
 * Pick the file name to use for a document that may lack an extension.
 *
 * When the document has no extension (doc->ext is not greater than
 * doc->base) and the mime type is a known image type, a placeholder name
 * carrying the matching extension is returned. In every other case
 * `filepath` is returned unchanged.
 *
 * @param doc      document whose ext/base offsets are inspected
 * @param filepath original path, returned as-is when no substitution applies
 * @param mime_str mime type string; NULL is treated as "unknown"
 * @return `filepath` or a pointer to a static string literal (never freed)
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {

    int has_extension = doc->ext > doc->base;

    // strcmp() on a NULL pointer is undefined behavior, so an unknown mime
    // type simply falls back to the original path.
    if (has_extension || mime_str == NULL) {
        return filepath;
    }

    if (strcmp(mime_str, "image/png") == 0) {
        return "file.png";
    }
    if (strcmp(mime_str, "image/jpeg") == 0) {
        return "file.jpg";
    }

    return filepath;
}
|
||||
|
||||
|
||||
__always_inline
|
||||
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
|
||||
|
||||
@ -497,7 +513,7 @@ int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
struct stat info;
|
||||
size_t size;
|
||||
FILE *file;
|
||||
void *buf;
|
||||
} memfile_t;
|
||||
@ -518,7 +534,7 @@ long memfile_seek(void *ptr, long offset, int whence) {
|
||||
memfile_t *mem = ptr;
|
||||
|
||||
if (whence == 0x10000) {
|
||||
return mem->info.st_size;
|
||||
return mem->size;
|
||||
}
|
||||
|
||||
int ret = fseek(mem->file, offset, whence);
|
||||
@ -530,31 +546,31 @@ long memfile_seek(void *ptr, long offset, int whence) {
|
||||
}
|
||||
|
||||
int memfile_open(vfile_t *f, memfile_t *mem) {
|
||||
mem->info = f->info;
|
||||
mem->size = f->info.st_size;
|
||||
|
||||
mem->buf = malloc(mem->info.st_size);
|
||||
mem->buf = malloc(mem->size);
|
||||
if (mem->buf == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int ret = f->read(f, mem->buf, mem->info.st_size);
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
int ret = f->read(f, mem->buf, mem->size);
|
||||
mem->file = fmemopen(mem->buf, mem->size, "rb");
|
||||
|
||||
if (f->calculate_checksum) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size);
|
||||
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
f->has_checksum = TRUE;
|
||||
}
|
||||
|
||||
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1;
|
||||
return (ret == mem->size && mem->file != NULL) ? 0 : -1;
|
||||
}
|
||||
|
||||
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
|
||||
mem->info.st_size = (int) buf_len;
|
||||
mem->size = (int) buf_len;
|
||||
|
||||
mem->buf = buf;
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
mem->file = fmemopen(mem->buf, mem->size, "rb");
|
||||
|
||||
return mem->file != NULL ? 0 : -1;
|
||||
}
|
||||
@ -566,7 +582,7 @@ void memfile_close(memfile_t *mem) {
|
||||
}
|
||||
}
|
||||
|
||||
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) {
|
||||
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
|
||||
|
||||
AVFormatContext *pFormatCtx = avformat_alloc_context();
|
||||
if (pFormatCtx == NULL) {
|
||||
@ -576,7 +592,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
|
||||
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
|
||||
AVIOContext *io_ctx = NULL;
|
||||
memfile_t memfile = {{}, 0, 0};
|
||||
memfile_t memfile = {0, 0, 0};
|
||||
|
||||
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
|
||||
|
||||
if (f->info.st_size <= ctx->max_media_buffer) {
|
||||
int ret = memfile_open(f, &memfile);
|
||||
@ -593,7 +611,7 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
|
||||
pFormatCtx->pb = io_ctx;
|
||||
|
||||
int res = avformat_open_input(&pFormatCtx, f->filepath, NULL, NULL);
|
||||
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
|
||||
if (res < 0) {
|
||||
if (res != -5) {
|
||||
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
|
||||
@ -612,12 +630,12 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
memfile_close(&memfile);
|
||||
}
|
||||
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
|
||||
|
||||
if (f->is_fs_file) {
|
||||
parse_media_filename(ctx, f->filepath, doc);
|
||||
} else {
|
||||
parse_media_vfile(ctx, f, doc);
|
||||
parse_media_vfile(ctx, f, doc, mime_str);
|
||||
}
|
||||
}
|
||||
|
||||
@ -626,7 +644,7 @@ void init_media() {
|
||||
}
|
||||
|
||||
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
|
||||
memfile_t memfile = {{}, 0, 0};
|
||||
memfile_t memfile = {0, 0, 0};
|
||||
AVIOContext *io_ctx = NULL;
|
||||
|
||||
AVFormatContext *pFormatCtx = avformat_alloc_context();
|
||||
@ -663,7 +681,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
AVStream *stream = pFormatCtx->streams[0];
|
||||
|
||||
// Decoder
|
||||
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
|
||||
avcodec_parameters_to_context(decoder, stream->codecpar);
|
||||
avcodec_open2(decoder, video_codec, NULL);
|
||||
|
@ -43,7 +43,7 @@ static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
|
||||
}
|
||||
|
||||
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
|
||||
|
||||
void init_media();
|
||||
|
||||
|
@ -41,8 +41,6 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
|
||||
if (err->level == XML_ERR_FATAL) {
|
||||
CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
|
||||
return -1;
|
||||
} else {
|
||||
CTX_LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,9 @@
|
||||
#define UNUSED(x) __attribute__((__unused__)) x
|
||||
|
||||
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
|
||||
|
||||
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
|
||||
|
||||
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
|
||||
|
||||
typedef int scan_code_t;
|
||||
@ -140,8 +142,12 @@ typedef struct vfile {
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
|
||||
void *rewind_buffer;
|
||||
int rewind_buffer_size;
|
||||
int rewind_buffer_cursor;
|
||||
|
||||
read_func_t read;
|
||||
seek_func_t seek;
|
||||
read_func_t read_rewindable;
|
||||
close_func_t close;
|
||||
reset_func_t reset;
|
||||
log_callback_t log;
|
||||
|
@ -18,6 +18,7 @@ extern "C" {
|
||||
|
||||
static scan_arc_ctx_t arc_recurse_media_ctx;
|
||||
static scan_arc_ctx_t arc_list_ctx;
|
||||
static scan_arc_ctx_t arc_recurse_ooxml_ctx;
|
||||
|
||||
static scan_text_ctx_t text_500_ctx;
|
||||
|
||||
@ -52,6 +53,10 @@ void _parse_media(parse_job_t *job) {
|
||||
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
|
||||
}
|
||||
|
||||
// Parse callback that runs the OOXML parser on the job's virtual file and
// stores the result in LastSubDoc.
void _parse_ooxml(parse_job_t *job) {
    vfile_t *sub_file = &job->vfile;
    parse_ooxml(&ooxml_500_ctx, sub_file, &LastSubDoc);
}
|
||||
|
||||
|
||||
/* Text */
|
||||
|
||||
@ -631,6 +636,42 @@ TEST(Ooxml, Docx1) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
TEST(Ooxml, Docx2) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx", &f, &doc);

    // Raise the content limit for this parse only. It is restored before the
    // assertions because a failing ASSERT_* returns from the test body and
    // would otherwise leave the shared context modified for later tests.
    ooxml_500_ctx.content_size = 999999;
    parse_ooxml(&ooxml_500_ctx, &f, &doc);
    ooxml_500_ctx.content_size = 500;

    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 2780);

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ooxml, Docx2Archive) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc);

    // Raise the content limit for this parse only. It is restored before the
    // assertions because a failing ASSERT_* returns from the test body and
    // would otherwise leave the shared context modified for later tests.
    ooxml_500_ctx.content_size = 999999;
    parse_archive(&arc_recurse_ooxml_ctx, &f, &doc);
    ooxml_500_ctx.content_size = 500;

    // The archive's sub-document is parsed through _parse_ooxml, which
    // writes its result to LastSubDoc.
    ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&LastSubDoc, MetaContent)->str_val), 2780);

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ooxml, Docx2Thumbnail) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
@ -1033,6 +1074,12 @@ int main(int argc, char **argv) {
|
||||
arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
|
||||
arc_recurse_media_ctx.parse = _parse_media;
|
||||
|
||||
arc_recurse_ooxml_ctx.log = noop_log;
|
||||
arc_recurse_ooxml_ctx.logf = noop_logf;
|
||||
arc_recurse_ooxml_ctx.store = counter_store;
|
||||
arc_recurse_ooxml_ctx.mode = ARC_MODE_RECURSE;
|
||||
arc_recurse_ooxml_ctx.parse = _parse_ooxml;
|
||||
|
||||
arc_list_ctx.log = noop_log;
|
||||
arc_list_ctx.logf = noop_logf;
|
||||
arc_list_ctx.store = counter_store;
|
||||
|
@ -16,7 +16,7 @@ int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
return read(f->fd, buf, size);
|
||||
return (int) read(f->fd, buf, size);
|
||||
}
|
||||
|
||||
//Note: No out of bounds check
|
||||
@ -68,7 +68,7 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
void load_mem(void *mem, size_t size, vfile_t *f) {
|
||||
f->filepath = "_mem_";
|
||||
f->_test_data = mem;
|
||||
f->info.st_size = size;
|
||||
f->info.st_size = (int) size;
|
||||
f->read = mem_read;
|
||||
f->close = nullptr;
|
||||
f->is_fs_file = TRUE;
|
||||
@ -108,7 +108,7 @@ void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
|
||||
}
|
||||
|
||||
for (int disp = 0; disp < width; disp++) {
|
||||
buf[offset + disp] = (int8_t)rand();
|
||||
buf[offset + disp] = (int8_t) rand();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user