From b7a565a1c4bb3598d6307edf33a3776ece0ea10a Mon Sep 17 00:00:00 2001 From: simon987 Date: Thu, 9 Apr 2020 16:16:01 -0400 Subject: [PATCH] support for mobi files simon987/sist2#41 --- CMakeLists.txt | 28 +++++++++++++-- libscan/macros.h | 6 ++++ libscan/mobi/scan_mobi.c | 73 ++++++++++++++++++++++++++++++++++++++++ libscan/mobi/scan_mobi.h | 14 ++++++++ libscan/ooxml/ooxml.c | 6 ---- libscan/scan.h | 10 ++---- libscan/util.h | 45 +++++++++++++++++++++---- test/main.cpp | 49 +++++++++++++++++++++++++++ test/test_util.h | 4 +-- 9 files changed, 210 insertions(+), 25 deletions(-) create mode 100644 libscan/mobi/scan_mobi.c create mode 100644 libscan/mobi/scan_mobi.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a6b6dfb..2edb926 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ add_library( libscan/font/font.c libscan/font/font.h third-party/utf8.h -) + libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h) set_target_properties(scan PROPERTIES LINKER_LANGUAGE C) set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib) @@ -49,7 +49,28 @@ target_compile_options( -g ) -#SET(CMAKE_C_LINK_EXECUTABLE "g++ -o ") +include(ExternalProject) +find_program(MAKE_EXE NAMES gmake nmake make) +ExternalProject_Add( + libmobi + GIT_REPOSITORY https://github.com/bfabiszewski/libmobi.git + GIT_TAG "public" + + UPDATE_COMMAND "" + PATCH_COMMAND "" + TEST_COMMAND "" + CONFIGURE_COMMAND ./autogen.sh && ./configure + INSTALL_COMMAND "" + + PREFIX "third-party/ext_libmobi" + SOURCE_DIR "third-party/ext_libmobi/src/libmobi" + BINARY_DIR "third-party/ext_libmobi/src/libmobi" + + BUILD_COMMAND make -j 3 --silent +) + +SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/) +SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/) string(REGEX REPLACE "-lvdpau" "" FFMPEG_LIBRARIES "${FFMPEG_LIBRARIES}") string(REGEX REPLACE "-lX11" "" FFMPEG_LIBRARIES "${FFMPEG_LIBRARIES}") @@ -69,6 +90,8 @@ target_link_libraries( 
${HARFBUZZ_LIBRARIES} libmupdf + ${MOBI_LIB_DIR}/libmobi.a + freetype ${HARFBUZZ_LIBRARIES} ${JBIG2DEC_LIB} @@ -106,6 +129,7 @@ target_include_directories( ${JPEG_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ${FFMPEG_INCLUDE_DIR} + ${MOBI_INCLUDE_DIR} ) # Testing diff --git a/libscan/macros.h b/libscan/macros.h index 0dfd5b0..3361ed6 100644 --- a/libscan/macros.h +++ b/libscan/macros.h @@ -19,3 +19,9 @@ #undef ABS #define ABS(a) (((a) < 0) ? -(a) : (a)) + +#define APPEND_STR_META(doc, keyname, value) \ + meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ + meta_str->key = keyname; \ + strcpy(meta_str->str_val, value); \ + APPEND_META(doc, meta_str) diff --git a/libscan/mobi/scan_mobi.c b/libscan/mobi/scan_mobi.c new file mode 100644 index 0000000..aa87e81 --- /dev/null +++ b/libscan/mobi/scan_mobi.c @@ -0,0 +1,99 @@
+#include "scan_mobi.h"
+
+#include <mobi.h>
+#include <errno.h>
+#include "stdlib.h"
+
+/**
+ * Parse a MOBI document with libmobi and attach its metadata to `doc`.
+ *
+ * The whole file is read into memory and handed to libmobi through an
+ * fmemopen() stream. The author and title, when libmobi returns them, are
+ * appended as MetaAuthor / MetaTitle. The raw markup is then run through
+ * text_buffer_append_markup() into a text buffer created with
+ * ctx->content_size, and the result is appended as MetaContent.
+ *
+ * Every failure is logged and ends the parse; metadata appended before the
+ * failure stays attached to `doc`.
+ *
+ * @param ctx scan context (content_size and the log callbacks)
+ * @param f   virtual file the document is read from
+ * @param doc document that receives the extracted metadata
+ */
+void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {
+
+    MOBIData *m = mobi_init();
+    if (m == NULL) {
+        CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
+        return;
+    }
+
+    /* Both buffers start out NULL so the cleanup path can free() them unconditionally. */
+    char *buf = NULL;
+    char *content_str = NULL;
+
+    size_t buf_len;
+    buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        CTX_LOG_ERROR(f->filepath, "read_all() failed")
+        goto cleanup;
+    }
+
+    FILE *file = fmemopen(buf, buf_len, "rb");
+    if (file == NULL) {
+        /* ferror() must not be called on a NULL stream; errno holds the reason. */
+        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed: %d", errno)
+        goto cleanup;
+    }
+
+    MOBI_RET mobi_ret = mobi_load_file(m, file);
+    fclose(file);
+    if (mobi_ret != MOBI_SUCCESS) {
+        CTX_LOG_ERRORF(f->filepath, "mobi_load_file() returned error code [%d]", mobi_ret)
+        goto cleanup;
+    }
+
+    char *author = mobi_meta_get_author(m);
+    if (author != NULL) {
+        APPEND_STR_META(doc, MetaAuthor, author)
+        free(author);
+    }
+    char *title = mobi_meta_get_title(m);
+    if (title != NULL) {
+        APPEND_STR_META(doc, MetaTitle, title)
+        free(title);
+    }
+
+    const size_t maxlen = mobi_get_text_maxsize(m);
+    if (maxlen == MOBI_NOTSET) {
+        CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
+        goto cleanup;
+    }
+
+    /* maxlen + 1: presumably room for the terminating NUL (confirm against libmobi). */
+    content_str = malloc(maxlen + 1);
+    if (content_str == NULL) {
+        CTX_LOG_ERROR(f->filepath, "malloc() failed")
+        goto cleanup;
+    }
+
+    size_t length = maxlen;
+    mobi_ret = mobi_get_rawml(m, content_str, &length);
+    if (mobi_ret != MOBI_SUCCESS) {
+        CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
+        goto cleanup;
+    }
+
+    text_buffer_t tex = text_buffer_create(ctx->content_size);
+    text_buffer_append_markup(&tex, content_str);
+    text_buffer_terminate_string(&tex);
+
+    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
+
+    text_buffer_destroy(&tex);
+
+cleanup:
+    free(content_str);
+    free(buf);
+    mobi_free(m);
+}
\ No newline at end of file
diff --git a/libscan/mobi/scan_mobi.h b/libscan/mobi/scan_mobi.h new file mode 100644 index 0000000..dd286f1 --- /dev/null +++ b/libscan/mobi/scan_mobi.h @@ -0,0 +1,14 @@ +#ifndef SCAN_SCAN_MOBI_H +#define SCAN_SCAN_MOBI_H + +#include "../scan.h" + +typedef struct { + long content_size; + log_callback_t log; + logf_callback_t logf; +} scan_mobi_ctx_t; + +void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc); + +#endif diff --git a/libscan/ooxml/ooxml.c b/libscan/ooxml/ooxml.c index 5e91b1a..573c2ad 100644 --- a/libscan/ooxml/ooxml.c +++ b/libscan/ooxml/ooxml.c @@ -105,12 +105,6 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu return ret; } -#define APPEND_STR_META(doc, keyname, value) \ - meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ - meta_str->key = keyname; \ - strcpy(meta_str->str_val, value); \ - APPEND_META(doc, meta_str) - __always_inline static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) { xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, diff --git a/libscan/scan.h b/libscan/scan.h index 8f67a6b..1bfd682 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -7,12 +7,6 @@ #include "macros.h" -// TODO: global init: -/* - * av_log_set_level(AV_LOG_QUIET); - */ - - #define META_INT_MASK 0x80 #define META_STR_MASK 0x40 #define
META_LONG_MASK 0x20 @@ -151,8 +145,8 @@ typedef struct parse_job_t { #include "util.h" typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len); -typedef void (*logf_callback_t)(char *filepath, int level, char *format, ...); -typedef void (*log_callback_t)(char *filepath, int level, char *str); +typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...); +typedef void (*log_callback_t)(const char *filepath, int level, char *str); typedef void (*parse_callback_t)(parse_job_t *job); #endif diff --git a/libscan/util.h b/libscan/util.h index 813c7f6..416b58e 100644 --- a/libscan/util.h +++ b/libscan/util.h @@ -81,7 +81,7 @@ static dyn_buffer_t dyn_buffer_create() { buf.size = INITIAL_BUF_SIZE; buf.cur = 0; - buf.buf = (char*)malloc(INITIAL_BUF_SIZE); + buf.buf = (char *) malloc(INITIAL_BUF_SIZE); return buf; } @@ -92,14 +92,14 @@ static void grow_buffer(dyn_buffer_t *buf, size_t size) { buf->size *= 2; } while (buf->cur + size > buf->size); - buf->buf = (char*)realloc(buf->buf, buf->size); + buf->buf = (char *) realloc(buf->buf, buf->size); } } static void grow_buffer_small(dyn_buffer_t *buf) { if (buf->cur + sizeof(long) > buf->size) { buf->size *= 2; - buf->buf = (char*)realloc(buf->buf, buf->size); + buf->buf = (char *) realloc(buf->buf, buf->size); } } @@ -230,7 +230,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t if (len <= 4) { for (int i = 0; i < len; i++) { - if (((utf8_int32_t)0xffffff80 & str[i]) == 0) { + if (((utf8_int32_t) 0xffffff80 & str[i]) == 0) { dyn_buffer_write_char(&buf->dyn_buffer, str[i]); } } @@ -241,7 +241,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t char tmp[16]; do { - ptr = (char*)utf8codepoint(ptr, &c); + ptr = (char *) utf8codepoint(ptr, &c); *(int *) tmp = 0x00000000; memcpy(tmp, oldPtr, ptr - oldPtr); oldPtr = ptr; @@ -264,8 +264,39 @@ static int text_buffer_append_string0(text_buffer_t *buf, char *str) { 
return text_buffer_append_string(buf, str, strlen(str)); } -static void* read_all(vfile_t *f, size_t *size) { - void* buf = malloc(f->info.st_size);
+/**
+ * Append one run of markup text, followed by a separating space, to `buf`.
+ *
+ * @param buf   destination text buffer
+ * @param start first character of the run
+ * @param len   number of bytes in the run
+ * @return TEXT_BUF_FULL if either append reports it, 0 otherwise
+ */
+static int text_buffer_append_markup_run(text_buffer_t *buf, const char *start, size_t len) {
+    if (text_buffer_append_string(buf, start, len) == TEXT_BUF_FULL) {
+        return TEXT_BUF_FULL;
+    }
+    if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
+        return TEXT_BUF_FULL;
+    }
+    return 0;
+}
+
+/**
+ * Append the text found outside of `<...>` tags in `markup` to `buf`.
+ *
+ * Each non-empty run of text is followed by a single space, so words that
+ * are separated only by tags are not glued together. Scanning starts outside
+ * of a tag and the text after the last tag is flushed as well, so input that
+ * does not begin or end with a tag is not dropped.
+ *
+ * @param buf    destination text buffer
+ * @param markup NUL-terminated markup string
+ * @return TEXT_BUF_FULL as soon as an append reports it, 0 otherwise
+ */
+static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
+
+    int tag_open = FALSE;
+    const char *ptr = markup;
+    const char *start = markup;
+
+    while (*ptr != '\0') {
+        if (tag_open) {
+            if (*ptr == '>') {
+                /* Tag closed: the next text run begins right after it. */
+                tag_open = FALSE;
+                start = ptr + 1;
+            }
+        } else if (*ptr == '<') {
+            tag_open = TRUE;
+            if (ptr != start) {
+                if (text_buffer_append_markup_run(buf, start, (size_t) (ptr - start)) == TEXT_BUF_FULL) {
+                    return TEXT_BUF_FULL;
+                }
+            }
+        }
+
+        ptr += 1;
+    }
+
+    /* Flush the text that follows the last tag; an unterminated tag is discarded. */
+    if (!tag_open && ptr != start) {
+        if (text_buffer_append_markup_run(buf, start, (size_t) (ptr - start)) == TEXT_BUF_FULL) {
+            return TEXT_BUF_FULL;
+        }
+    }
+
+    return 0;
+}
+
+static void *read_all(vfile_t *f, size_t *size) { + void *buf = malloc(f->info.st_size); *size = f->read(f, buf, f->info.st_size); //TODO: log diff --git a/test/main.cpp b/test/main.cpp index 8dff3a4..f6cae12 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -7,6 +7,7 @@ extern "C" { #include "../libscan/ebook/ebook.h" #include "../libscan/media/media.h" #include "../libscan/ooxml/ooxml.h" +#include "../libscan/mobi/scan_mobi.h" #include } @@ -22,6 +23,7 @@ static scan_media_ctx_t media_ctx; static scan_ooxml_ctx_t ooxml_500_ctx; +static scan_mobi_ctx_t mobi_500_ctx; /* Text */ @@ -298,6 +300,49 @@ TEST(Ooxml, Xlsx1) { cleanup(&doc, &f); } +/* Mobi */ +TEST(Mobi, Mobi1) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/mobi/Norse Mythology - Neil Gaiman.mobi", &f, &doc); + + parse_mobi(&mobi_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Gaiman, Neil"); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Norse Mythology"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} + +TEST(Mobi, Azw) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/mobi/sample.azw", &f, &doc); + + parse_mobi(&mobi_500_ctx, &f, &doc); + +
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Nietzsche, Friedrich"); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "On the Genealogy of Morality (Hackett Classics)"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} + +TEST(Mobi, Azw3) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/mobi/sample.azw3", &f, &doc); + + parse_mobi(&mobi_500_ctx, &f, &doc); + + ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "George Orwell; Amélie Audiberti"); + ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "1984"); + ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 1); + + cleanup(&doc, &f); +} + int main(int argc, char **argv) { arc_recurse_ctx.log = noop_log; arc_recurse_ctx.logf = noop_logf; @@ -335,6 +380,10 @@ int main(int argc, char **argv) { ooxml_500_ctx.log = noop_log; ooxml_500_ctx.logf = noop_logf; + mobi_500_ctx.content_size = 500; + mobi_500_ctx.log = noop_log; + mobi_500_ctx.logf = noop_logf; + av_log_set_level(AV_LOG_QUIET); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/test/test_util.h b/test/test_util.h index abc90b4..55d773d 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -9,11 +9,11 @@ void load_doc_mem(void *mem, size_t mem_len, vfile_t *f, document_t *doc); void load_doc_file(const char *filepath, vfile_t *f, document_t *doc); void cleanup(document_t *doc, vfile_t *f); -static void noop_logf(char *filepath, int level, char *format, ...) { +static void noop_logf(const char *filepath, int level, char *format, ...) { // noop } -static void noop_log(char *filepath, int level, char *str) { +static void noop_log(const char *filepath, int level, char *str) { // noop }