Faster comic book parsing, probably fixes simon987/sist2#77

This commit is contained in:
simon987 2020-07-17 19:10:13 -04:00
parent 7c1a832360
commit 0438c0e761
11 changed files with 284 additions and 146 deletions

View File

@ -14,7 +14,7 @@ add_library(
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/cbr/cbr.c libscan/cbr/cbr.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h

View File

@ -1,15 +1,11 @@
#include "arc.h"
#include "../scan.h"
#include "../util.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
int should_parse_filtered_file(const char *filepath, int ext) {
char tmp[PATH_MAX * 2];
@ -41,67 +37,41 @@ int arc_read(struct vfile *f, void *buf, size_t size) {
return archive_read_data(f->arc, buf, size);
}
/*
 * User data handed to the libarchive callbacks: the vfile being read and the
 * scratch buffer that vfile_read_callback() fills and exposes to libarchive.
 */
typedef struct arc_data {
/* Source of the archive bytes; its read()/close() are invoked by the callbacks. */
vfile_t *f;
/* Overwritten on every read callback with up to ARC_BUF_SIZE bytes. */
char buf[ARC_BUF_SIZE];
} arc_data_f;
int arc_open(vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
arc_data->f = f;
int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_f *data = user_data;
if (f->is_fs_file) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
if (data->f->is_fs_file && data->f->fd == -1) {
data->f->fd = open(data->f->filepath, O_RDONLY);
}
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
} else if (allow_recurse) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
return ARCHIVE_OK;
}
long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_f *data = user_data;
*buf = data->buf;
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
}
int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_f *data = user_data;
if (data->f->close != NULL) {
data->f->close(data->f);
}
return ARCHIVE_OK;
}
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a;
struct archive_entry *entry;
arc_data_f data;
data.f = f;
int ret = 0;
if (data.f->is_fs_file) {
a = archive_read_new();
archive_read_support_filter_all(a);
archive_read_support_format_all(a);
ret = archive_read_open_filename(a, f->filepath, ARC_BUF_SIZE);
} else if (ctx->mode == ARC_MODE_RECURSE) {
a = archive_read_new();
archive_read_support_filter_all(a);
archive_read_support_format_all(a);
ret = archive_read_open(
a, &data,
return archive_read_open(
*a, arc_data,
vfile_open_callback,
vfile_read_callback,
vfile_close_callback
);
} else {
return ARC_SKIPPED;
}
}
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
arc_data.f = f;
int ret = arc_open(f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
if (ret == ARC_SKIPPED) {
return SCAN_OK;
}
@ -112,15 +82,14 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
}
if (ctx->mode == ARC_MODE_LIST) {
dyn_buffer_t buf = dyn_buffer_create();
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char* utf8_name = archive_entry_pathname_utf8(entry);
const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *path = (char *) archive_entry_pathname_utf8(entry);
dyn_buffer_append_string(&buf, path);
dyn_buffer_append_string(&buf, file_path);
dyn_buffer_write_char(&buf, ' ');
}
}

View File

@ -3,8 +3,10 @@
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include "../scan.h"
# define ARC_SKIPPED -1
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
@ -22,6 +24,40 @@ typedef struct {
#define ARC_BUF_SIZE 8192
/*
 * User data passed to the vfile_*_callback functions: the vfile that backs
 * the archive plus the scratch buffer exposed to libarchive on each read.
 */
typedef struct {
/* Source of the archive bytes; its read()/close() are invoked by the callbacks. */
vfile_t *f;
/* Overwritten on every read callback with up to ARC_BUF_SIZE bytes. */
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t*)user_data;
if (data->f->is_fs_file && data->f->fd == -1) {
data->f->fd = open(data->f->filepath, O_RDONLY);
}
return ARCHIVE_OK;
}
/*
 * libarchive read callback.
 *
 * Publishes the per-archive scratch buffer through *buf, then fills it with
 * up to ARC_BUF_SIZE bytes using the vfile's own read(). The value returned
 * by read() is passed back to libarchive unchanged.
 */
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
    arc_data_t *state = (arc_data_t *) user_data;

    *buf = state->buf;

    long n_read = state->f->read(state->f, state->buf, ARC_BUF_SIZE);
    return n_read;
}
/*
 * libarchive close callback.
 *
 * Invokes the vfile's close() when one is set; always reports ARCHIVE_OK.
 */
static int vfile_close_callback(struct archive *a, void *user_data) {
    arc_data_t *state = (arc_data_t *) user_data;

    if (state->f->close == NULL) {
        return ARCHIVE_OK;
    }

    state->f->close(state->f);
    return ARCHIVE_OK;
}
int arc_open(vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);

View File

@ -1,50 +0,0 @@
#include "cbr.h"
#include "../scan.h"
#include "../util.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
/**
 * Parse a RAR comic book by repacking it as an in-memory ZIP and handing the
 * result to parse_ebook_mem() as "application/x-cbz".
 *
 * The whole input is read with read_all(), every RAR entry is copied into a
 * ZIP written to a buffer twice the input size, and doc->mime is set to
 * ctx->cbr_mime afterwards. When an allocation fails, an archive cannot be
 * opened, or writing the ZIP fails, the ebook parser is not invoked; the
 * mime is still assigned.
 *
 * @param ctx cbr scan context (ebook context and cbr mime id)
 * @param f   virtual file containing the RAR archive
 * @param doc document being populated
 */
void parse_cbr(scan_cbr_ctx_t *ctx, vfile_t *f, document_t *doc) {
    size_t buf_len = 0;
    size_t out_buf_size = 0;
    size_t out_buf_used = 0;
    char *out_buf = NULL;
    struct archive *rar_in = NULL;
    struct archive *zip_out = NULL;
    struct archive_entry *entry = NULL;
    int converted = 0;

    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        goto cleanup;
    }

    // TODO: we probably only need 1.2x or 1.5x, even better would be a dynamic buffer
    out_buf_size = buf_len * 2;
    out_buf = malloc(out_buf_size);
    if (out_buf == NULL) {
        goto cleanup;
    }

    rar_in = archive_read_new();
    zip_out = archive_write_new();
    if (rar_in == NULL || zip_out == NULL) {
        goto cleanup;
    }

    archive_read_support_filter_none(rar_in);
    archive_read_support_format_rar(rar_in);
    archive_write_set_format_zip(zip_out);

    if (archive_read_open_memory(rar_in, buf, buf_len) != ARCHIVE_OK) {
        goto cleanup;
    }
    if (archive_write_open_memory(zip_out, out_buf, out_buf_size, &out_buf_used) != ARCHIVE_OK) {
        goto cleanup;
    }

    converted = 1;
    while (converted && archive_read_next_header(rar_in, &entry) == ARCHIVE_OK) {
        int hdr = archive_write_header(zip_out, entry);
        if (hdr == ARCHIVE_FATAL) {
            // The writer is unusable (e.g. output buffer exhausted).
            converted = 0;
            break;
        }
        if (hdr == ARCHIVE_FAILED) {
            // This entry was rejected; skip its data and keep going.
            continue;
        }

        char arc_buf[ARC_BUF_SIZE];
        int len = archive_read_data(rar_in, arc_buf, ARC_BUF_SIZE);
        while (len > 0) {
            if (archive_write_data(zip_out, arc_buf, len) < 0) {
                converted = 0;
                break;
            }
            len = archive_read_data(rar_in, arc_buf, ARC_BUF_SIZE);
        }
    }

    // Closing the writer finalizes the ZIP and the value of out_buf_used,
    // so it has to happen before the buffer is parsed.
    if (archive_write_close(zip_out) < ARCHIVE_WARN) {
        converted = 0;
    }

    if (converted) {
        parse_ebook_mem(&ctx->ebook_ctx, out_buf, out_buf_used, "application/x-cbz", doc);
    }

cleanup:
    if (zip_out != NULL) {
        archive_write_free(zip_out);
    }
    if (rar_in != NULL) {
        archive_read_close(rar_in);
        archive_read_free(rar_in);
    }

    doc->mime = ctx->cbr_mime;

    free(out_buf);
    free(buf);
}

View File

@ -1,22 +0,0 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
/* Context for parse_cbr() and is_cbr(). */
typedef struct {
/* Passed to parse_ebook_mem() when parsing the converted archive. */
scan_ebook_ctx_t ebook_ctx;
/* Mime id written to doc->mime by parse_cbr() and compared by is_cbr(). */
unsigned int cbr_mime;
/* Logging and storage callbacks; not referenced directly by parse_cbr(). */
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_cbr_ctx_t;
/* Returns non-zero when mime equals ctx->cbr_mime, zero otherwise. */
__always_inline
static int is_cbr(scan_cbr_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->cbr_mime;
    return expected == mime;
}
void parse_cbr(scan_cbr_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

44
libscan/comic/comic.c Normal file
View File

@ -0,0 +1,44 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
/**
 * Generate a thumbnail for a comic book archive from the first image entry
 * that store_image_thumbnail() accepts.
 *
 * Walks the archive entries; every regular file whose name ends in ".png",
 * ".jpg" or ".jpeg" (case-sensitive) is read into memory and passed to
 * store_image_thumbnail(). Iteration stops after the first success.
 *
 * @param ctx comic scan context; cast to scan_media_ctx_t* for the thumbnail call
 * @param f   virtual file holding the archive
 * @param doc document the thumbnail is stored for
 */
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    int ret = arc_open(f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        // archive_error_string() may return NULL when no message was recorded.
        const char *err = archive_error_string(a);
        CTX_LOG_ERRORF(f->filepath, "(comic.c) [%d] %s", ret, err != NULL ? err : "unknown error")
        archive_read_free(a);
        return;
    }

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        const struct stat *info = archive_entry_stat(entry);
        if (info == NULL || !S_ISREG(info->st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
        if (file_path == NULL) {
            continue;
        }

        // Entries without an extension must be skipped before any strcmp().
        const char *ext = strrchr(file_path, '.');
        if (ext == NULL) {
            continue;
        }
        if (strcmp(ext, ".png") != 0 && strcmp(ext, ".jpg") != 0 && strcmp(ext, ".jpeg") != 0) {
            continue;
        }

        long long entry_size = archive_entry_size(entry);
        if (entry_size <= 0) {
            continue;
        }

        void *buf = malloc((size_t) entry_size);
        if (buf == NULL) {
            // Not enough memory for this page; a later, smaller one may fit.
            continue;
        }

        // Only hand over the bytes that were actually read from the entry.
        long long bytes_read = archive_read_data(a, buf, (size_t) entry_size);
        if (bytes_read > 0) {
            ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, (size_t) bytes_read, doc);
        } else {
            ret = FALSE;
        }
        free(buf);

        if (ret == TRUE) {
            break;
        }
    }

    archive_read_free(a);
}

31
libscan/comic/comic.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
/*
 * Context for parse_comic(), is_cbr() and is_cbz().
 *
 * NOTE(review): parse_comic() passes this struct to store_image_thumbnail()
 * through a cast to scan_media_ctx_t*, so the fields that function reads
 * (log, logf, store, tn_size, tn_qscale) presumably have to stay in the same
 * order and position as in scan_media_ctx_t - confirm before reordering.
 */
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
/* Mime id compared by is_cbr(). */
unsigned int cbr_mime;
/* Mime id compared by is_cbz(). */
unsigned int cbz_mime;
} scan_comic_ctx_t;
/* Returns non-zero when mime equals ctx->cbr_mime, zero otherwise. */
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->cbr_mime;
    return expected == mime;
}
/* Returns non-zero when mime equals ctx->cbz_mime, zero otherwise. */
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->cbz_mime;
    return expected == mime;
}
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -40,6 +40,10 @@ static AVCodecContext *alloc_jpeg_encoder(scan_media_ctx_t *ctx, int dstW, int d
__always_inline
AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
return NULL;
}
int dstW;
int dstH;
if (frame->width <= size && frame->height <= size) {
@ -443,7 +447,16 @@ int memfile_open(vfile_t *f, memfile_t *mem) {
int ret = f->read(f, mem->buf, mem->info.st_size);
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
return ret == mem->info.st_size ? 0 : -1;
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1;
}
/*
 * Wrap an existing buffer in a memfile_t.
 *
 * Records the buffer and its length in mem and opens a read-only memory
 * stream over it with fmemopen(). The buffer is not copied.
 *
 * Returns 0 when the stream was opened, -1 otherwise.
 */
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
    mem->info.st_size = buf_len;
    mem->buf = buf;

    mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
    if (mem->file == NULL) {
        return -1;
    }
    return 0;
}
void memfile_close(memfile_t *mem) {
@ -517,3 +530,102 @@ void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) {
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
/**
 * Decode an in-memory image, scale it, encode it as JPEG and store it as the
 * thumbnail of doc.
 *
 * The buffer is exposed to libavformat through a memfile-backed custom
 * AVIOContext. The first stream is decoded with read_frame(), scaled with
 * scale_frame() to ctx->tn_size, encoded with the encoder returned by
 * alloc_jpeg_encoder() and handed to ctx->store() keyed by doc->uuid.
 *
 * @param ctx     media context (callbacks, tn_size, tn_qscale)
 * @param buf     image bytes; not freed by this function
 * @param buf_len number of bytes in buf
 * @param doc     document the thumbnail belongs to
 * @return TRUE when a thumbnail was stored, FALSE on any failure
 */
int store_image_thumbnail(scan_media_ctx_t *ctx, void* buf, size_t buf_len, document_t *doc) {
    int stored = FALSE;
    int input_opened = FALSE;
    memfile_t memfile;
    unsigned char *buffer = NULL;
    AVIOContext *io_ctx = NULL;
    AVFormatContext *pFormatCtx = NULL;
    AVStream *stream = NULL;
    AVCodec *video_codec = NULL;
    AVCodecContext *decoder = NULL;
    AVCodecContext *jpeg_encoder = NULL;
    AVFrame *frame = NULL;
    AVFrame *scaled_frame = NULL;

    memfile.file = NULL;

    pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return FALSE;
    }

    if (memfile_open_buf(buf, buf_len, &memfile) != 0) {
        // No stream was opened, so there is nothing to fclose() later.
        memfile.file = NULL;
        goto cleanup;
    }
    CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%ldB)", (long) buf_len)

    buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        goto cleanup;
    }

    io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
    if (io_ctx == NULL) {
        av_free(buffer);
        goto cleanup;
    }
    // From here on the I/O buffer is released through io_ctx->buffer, because
    // libavformat may replace the one passed to avio_alloc_context().
    pFormatCtx->pb = io_ctx;

    // On failure avformat_open_input() frees the context and sets it to NULL.
    if (avformat_open_input(&pFormatCtx, "", NULL, NULL) != 0) {
        goto cleanup;
    }
    input_opened = TRUE;

    if (pFormatCtx->nb_streams == 0) {
        goto cleanup;
    }
    stream = pFormatCtx->streams[0];

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    if (video_codec == NULL) {
        goto cleanup;
    }
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    if (avcodec_parameters_to_context(decoder, stream->codecpar) < 0) {
        goto cleanup;
    }
    if (avcodec_open2(decoder, video_codec, NULL) < 0) {
        goto cleanup;
    }

    frame = read_frame(ctx, pFormatCtx, decoder, 0, doc);
    if (frame == NULL) {
        goto cleanup;
    }

    // Scale frame
    scaled_frame = scale_frame(decoder, frame, ctx->tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    // Encode frame to jpeg
    jpeg_encoder = alloc_jpeg_encoder(ctx, scaled_frame->width, scaled_frame->height, ctx->tn_qscale);
    if (jpeg_encoder == NULL) {
        goto cleanup;
    }

    {
        AVPacket jpeg_packet;
        av_init_packet(&jpeg_packet);
        // av_init_packet() leaves data/size untouched; set them explicitly.
        jpeg_packet.data = NULL;
        jpeg_packet.size = 0;

        if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0 &&
            avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
            // Save thumbnail
            APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
            ctx->store((char *) doc->uuid, sizeof(doc->uuid), (char *) jpeg_packet.data, jpeg_packet.size);
            stored = TRUE;
        }

        av_packet_unref(&jpeg_packet);
    }

cleanup:
    av_frame_free(&frame);
    if (scaled_frame != NULL) {
        // NOTE(review): presumably scale_frame() allocates the pixel buffer
        // separately from the frame - kept as in the original teardown.
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }
    avcodec_free_context(&jpeg_encoder);
    avcodec_free_context(&decoder);

    if (input_opened) {
        avformat_close_input(&pFormatCtx);
    } else {
        // Never opened (or already freed by a failed open): free only the
        // context so the custom I/O context is not touched here.
        if (pFormatCtx != NULL) {
            pFormatCtx->pb = NULL;
        }
        avformat_free_context(pFormatCtx);
    }

    if (io_ctx != NULL) {
        av_free(io_ctx->buffer);
        avio_context_free(&io_ctx);
    }
    if (memfile.file != NULL) {
        fclose(memfile.file);
    }

    return stored;
}

View File

@ -5,16 +5,18 @@
#include "../scan.h"
typedef struct {
int tn_size;
float tn_qscale;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
long max_media_buffer;
} scan_media_ctx_t;
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc);
void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void* buf, size_t buf_len, document_t *doc);
#endif

View File

@ -117,12 +117,12 @@ static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
buf->cur += sizeof(c);
}
static void dyn_buffer_write_str(dyn_buffer_t *buf, char *str) {
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
dyn_buffer_write_char(buf, '\0');
}
static void dyn_buffer_append_string(dyn_buffer_t *buf, char *str) {
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
}

View File

@ -5,6 +5,7 @@ extern "C" {
#include "../libscan/arc/arc.h"
#include "../libscan/text/text.h"
#include "../libscan/ebook/ebook.h"
#include "../libscan/comic/comic.h"
#include "../libscan/media/media.h"
#include "../libscan/ooxml/ooxml.h"
#include "../libscan/mobi/scan_mobi.h"
@ -20,6 +21,8 @@ static scan_text_ctx_t text_500_ctx;
static scan_ebook_ctx_t ebook_ctx;
static scan_ebook_ctx_t ebook_500_ctx;
static scan_comic_ctx_t comic_ctx;
static scan_media_ctx_t media_ctx;
static scan_ooxml_ctx_t ooxml_500_ctx;
@ -203,25 +206,32 @@ TEST(Ebook, Epub1) {
cleanup(&doc, &f);
}
TEST(Ebook, ComicCbz) {
/* Comic */
TEST(Comic, ComicCbz) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/lost_treasure.cbz", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/vnd.comicbook+zip", &doc);
size_t size_before = store_size;
parse_comic(&comic_ctx, &f, &doc);
ASSERT_NE(size_before, store_size);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
TEST(Ebook, ComicCbr) {
TEST(Comic, ComicCbr) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/laugh.cbr", &f, &doc);
parse_ebook(&ebook_500_ctx, &f, "application/vnd.comicbook-rar", &doc);
size_t size_before = store_size;
parse_comic(&comic_ctx, &f, &doc);
ASSERT_NE(size_before, store_size);
//TODO: Check that thumbnail was generated correctly
cleanup(&doc, &f);
}
@ -589,6 +599,12 @@ int main(int argc, char **argv) {
ebook_500_ctx = ebook_ctx;
ebook_500_ctx.content_size = 500;
comic_ctx.tn_qscale = 1.0;
comic_ctx.tn_size = 500;
comic_ctx.log = noop_log;
comic_ctx.logf = noop_logf;
comic_ctx.store = counter_store;
media_ctx.log = noop_log;
media_ctx.logf = noop_logf;
media_ctx.store = counter_store;