diff --git a/.gitignore b/.gitignore
index d55133b..6b82d53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ libscan.so
 *.cbp
 CMakeFiles
 CMakeCache.txt
-scan_test
\ No newline at end of file
+scan_test
+third-party/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fe427e..f7d4856 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,32 @@ project(scan C)
 set(CMAKE_C_STANDARD 11)
 
 find_package(LibArchive REQUIRED)
+find_package(Threads REQUIRED)
+find_package(Tesseract CONFIG REQUIRED)
+find_package(harfbuzz CONFIG REQUIRED)
+find_package(OpenJPEG CONFIG REQUIRED)
+find_package(JPEG REQUIRED)
+
+
+include(ExternalProject)
+find_program(MAKE_EXE NAMES gmake nmake make)
+ExternalProject_Add(
+        mupdf
+        # TODO: use master branch ?
+        URL https://mupdf.com/downloads/archive/mupdf-1.16.1-source.tar.xz
+
+        UPDATE_COMMAND ""
+        PATCH_COMMAND ""
+        TEST_COMMAND ""
+        CONFIGURE_COMMAND ""
+        INSTALL_COMMAND ""
+
+        PREFIX "third-party/ext_mupdf"
+        BINARY_DIR "third-party/ext_mupdf/src/mupdf"
+
+        BUILD_COMMAND CFLAGS=-fPIC HAVE_CURL=no HAVE_GLUT=no ${MAKE_EXE} -j 4 --silent
+        && ar d build/release/libmupdf-third.a jutils.o jdinput.o jdmarker.o jdmaster.o
+)
 
 add_library(
         scan
@@ -13,6 +39,7 @@ add_library(
 
         libscan/text/text.c libscan/text/text.h
         libscan/arc/arc.c libscan/arc/arc.h
+        libscan/ebook/ebook.c libscan/ebook/ebook.h
 
         third-party/utf8.h
 )
@@ -24,10 +51,42 @@ target_compile_options(
         -g
 )
 
+add_dependencies(
+        scan
+        mupdf
+)
+
+SET(CMAKE_C_LINK_EXECUTABLE "g++ <FLAGS> <CMAKE_C_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+
 target_link_libraries(
         scan
+        -static
+        -static-libgcc
+        -static-libstdc++
+
+        -Wl,--whole-archive
+        m
+        -Wl,--no-whole-archive
+
+        "${CMAKE_SOURCE_DIR}/third-party/ext_mupdf/src/mupdf/build/release/libmupdf.a"
+        "${CMAKE_SOURCE_DIR}/third-party/ext_mupdf/src/mupdf/build/release/libmupdf-third.a"
+
+        ${JPEG_LIBRARIES}
         ${LibArchive_LIBRARIES}
+        ${Tesseract_LIBRARIES}
+
+        ${CMAKE_THREAD_LIBS_INIT}
+
+        # TODO: Looks like I don't need to explicitly link to libuuid?
+)
+
+target_include_directories(
+        scan
+        BEFORE
+        PRIVATE
+        "${CMAKE_SOURCE_DIR}/third-party/ext_mupdf/src/mupdf/include/"
+        ${JPEG_INCLUDE_DIR}
 )
 
 
 
@@ -37,7 +96,6 @@ add_executable(
         test/main.c
 )
 
-
 target_link_libraries(
         scan_test
         scan
diff --git a/README.md b/README.md
index 4dc6cf5..71ff6e5 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,8 @@
 
 ```bash
-vcpkg install libarchive
+vcpkg install libarchive pthread tesseract
+
+rm -rf CMakeFiles/ CMakeCache.txt
+cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake .
+make -j 4
 ```
\ No newline at end of file
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..2d3464b
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+export CC=gcc
+export CXX=g++
+
+rm -rf CMakeFiles CMakeCache.txt
+cmake -DCMAKE_TOOLCHAIN_FILE=/usr/share/vcpkg/scripts/buildsystems/vcpkg.cmake . || exit
+make -j 4
diff --git a/libscan/ebook/ebook.c b/libscan/ebook/ebook.c
new file mode 100644
index 0000000..10db675
--- /dev/null
+++ b/libscan/ebook/ebook.c
@@ -0,0 +1,371 @@
+#include "ebook.h"
+#include "../util.h"
+#include <mupdf/fitz.h>
+#include <pthread.h>
+#include <tesseract/capi.h>
+
+#define MIN_OCR_SIZE 350
+#define MIN_OCR_LEN 10
+
+/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
+__thread text_buffer_t thread_buffer;
+__thread scan_ebook_ctx_t thread_ctx;
+
+
+/**
+ * Render page 0 of fzdoc into a PNG buffer scaled so that its longest side is ctx->tn_size.
+ * The PNG is not stored anywhere yet (the store_write() call is still commented out).
+ *
+ * @return TRUE on success, FALSE if loading, drawing or encoding the page failed
+ */
+int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
+
+    int err = 0;
+    fz_page *cover = NULL;
+
+    fz_var(cover);
+    fz_var(err);
+    fz_try(fzctx)
+        cover = fz_load_page(fzctx, fzdoc, 0);
+    fz_catch(fzctx)
+        err = 1;
+
+    if (err != 0) {
+        fz_drop_page(fzctx, cover);
+// LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, ctx->error.message)
+        return FALSE;
+    }
+
+    fz_rect bounds = fz_bound_page(fzctx, cover);
+
+    float scale;
+    float w = (float) bounds.x1 - bounds.x0;
+    float h = (float) bounds.y1 - bounds.y0;
+    if (w > h) {
+        scale = (float) ctx->tn_size / w;
+    } else {
+        scale = (float) ctx->tn_size / h;
+    }
+    fz_matrix m = fz_scale(scale, scale);
+
+    bounds = fz_transform_rect(bounds, m);
+    fz_irect bbox = fz_round_rect(bounds);
+    fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fzctx->colorspace->rgb, bbox, NULL, 0);
+
+    fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
+    fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
+
+    fz_var(err);
+    fz_try(fzctx)
+    {
+        pthread_mutex_lock(&ctx->mupdf_mutex);
+        fz_run_page(fzctx, cover, dev, fz_identity, NULL);
+    }
+    fz_always(fzctx)
+    {
+        fz_close_device(fzctx, dev);
+        fz_drop_device(fzctx, dev);
+        pthread_mutex_unlock(&ctx->mupdf_mutex);
+    }
+    fz_catch(fzctx)
+        err = fzctx->error.errcode;
+
+    if (err != 0) {
+// LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, ctx->error.message)
+        fz_drop_page(fzctx, cover);
+        fz_drop_pixmap(fzctx, pixmap);
+        return FALSE;
+    }
+
+    fz_buffer *fzbuf = NULL;
+    fz_var(fzbuf);
+    fz_var(err);
+
+    fz_try(fzctx)
+        fzbuf = fz_new_buffer_from_pixmap_as_png(fzctx, pixmap, fz_default_color_params);
+    fz_catch(fzctx)
+        err = fzctx->error.errcode;
+
+    if (err == 0) {
+        unsigned char *tn_buf;
+        size_t tn_len = fz_buffer_storage(fzctx, fzbuf, &tn_buf);
+// store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len);
+    }
+
+    fz_drop_buffer(fzctx, fzbuf);
+    fz_drop_pixmap(fzctx, pixmap);
+    fz_drop_page(fzctx, cover);
+
+    if (err != 0) {
+// LOG_WARNINGF(doc->filepath, "fz_new_buffer_from_pixmap_as_png() returned error code [%d] %s", err,
+// ctx->error.message)
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+/* Warning/error hook installed by init_fzctx(); messages are ignored for now (logging is commented out) */
+void fz_err_callback(void *user, UNUSED(const char *message)) {
+// if (LogCtx.verbose) {
+// document_t *doc = (document_t *) user;
+// LOG_WARNINGF(doc->filepath, "FZ: %s", message)
+// }
+}
+
+/* Disable ICC, register the document handlers and route MuPDF warnings/errors to fz_err_callback */
+static void init_fzctx(fz_context *fzctx, document_t *doc) {
+    fz_disable_icc(fzctx);
+    fz_register_document_handlers(fzctx);
+
+    fzctx->warn.print_user = doc;
+    fzctx->warn.print = fz_err_callback;
+    fzctx->error.print_user = doc;
+    fzctx->error.print = fz_err_callback;
+}
+
+/* Append every character of a text block to tex; returns TEXT_BUF_FULL as soon as an append reports it, else 0 */
+static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
+    if (block->type != FZ_STEXT_BLOCK_TEXT) {
+        return 0;
+    }
+
+    fz_stext_line *line = block->u.t.first_line;
+    while (line != NULL) {
+        fz_stext_char *c = line->first_char;
+        while (c != NULL) {
+            if (text_buffer_append_char(tex, c->c) == TEXT_BUF_FULL) {
+                return TEXT_BUF_FULL;
+            }
+            c = c->next;
+        }
+        line = line->next;
+    }
+    return 0;
+}
+
+#define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32)
+
+/**
+ * fz_device fill_image hook: runs Tesseract on images larger than MIN_OCR_SIZE and appends the
+ * recognized text to thread_buffer. Data path and language are read from thread_ctx.
+ */
+void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
+                fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
+                UNUSED(fz_color_params color_params)) {
+
+    int l2factor = 0;
+
+    if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
+
+        fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
+
+        if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
+            TessBaseAPI *api = TessBaseAPICreate();
+
+            /* TessBaseAPIInit3() returns 0 on success; skip OCR when the language data cannot be loaded */
+            if (TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang) == 0) {
+                TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
+                TessBaseAPISetSourceResolution(api, pix->xres);
+
+                char *text = TessBaseAPIGetUTF8Text(api);
+                if (text != NULL) {
+                    size_t len = strlen(text);
+                    if (len >= MIN_OCR_LEN) {
+                        text_buffer_append_string(&thread_buffer, text, len - 1);
+// LOG_DEBUGF(
+// "pdf.c",
+// "(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
+// pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
+// )
+                    }
+                    /* The caller owns the string returned by TessBaseAPIGetUTF8Text() */
+                    TessDeleteText(text);
+                }
+            }
+
+            TessBaseAPIEnd(api);
+            TessBaseAPIDelete(api);
+        }
+        fz_drop_pixmap(fzctx, pix);
+    }
+}
+
+/* Drop the MuPDF handles opened by parse_ebook(), then free the file buffer that backs the stream */
+static void close_ebook(fz_context *fzctx, fz_document *fzdoc, fz_stream *stream, void *buf) {
+    fz_drop_stream(fzctx, stream);
+    fz_drop_document(fzctx, fzdoc);
+    fz_drop_context(fzctx);
+    free(buf);
+}
+
+/**
+ * Extract the title, a cover thumbnail (when ctx->tn_size > 0) and the text content
+ * (when ctx->content_size > 0) of the document read from f, appending metadata to doc.
+ *
+ * @param mime_str passed to fz_open_document_with_stream() to pick the document handler
+ */
+void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str, document_t *doc) {
+
+    size_t buf_len;
+    void * buf = read_all(f, &buf_len);
+    if (buf == NULL) {
+        return;
+    }
+
+    /* NOTE(review): this flag is shared by every ctx and is not synchronized; only the first ctx seen gets its mutex initialized */
+    static int mu_is_initialized = 0;
+    if (!mu_is_initialized) {
+        pthread_mutex_init(&ctx->mupdf_mutex, NULL);
+        mu_is_initialized = 1;
+    }
+    fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
+    if (fzctx == NULL) {
+        free(buf);
+        return;
+    }
+
+    init_fzctx(fzctx, doc);
+
+    /* fill_image() cannot receive ctx as an argument: hand it the OCR settings through thread_ctx */
+    thread_ctx.tesseract_lang = ctx->tesseract_lang;
+    thread_ctx.tesseract_path = ctx->tesseract_path;
+
+    int err = 0;
+
+    fz_document *fzdoc = NULL;
+    fz_stream *stream = NULL;
+    fz_var(fzdoc);
+    fz_var(stream);
+    fz_var(err);
+
+    fz_try(fzctx)
+    {
+        stream = fz_open_memory(fzctx, buf, buf_len);
+        fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
+    }
+    fz_catch(fzctx)
+        err = fzctx->error.errcode;
+
+    if (err != 0) {
+        close_ebook(fzctx, fzdoc, stream, buf);
+        return;
+    }
+
+    char title[4096] = {'\0',};
+    fz_try(fzctx)
+        fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
+    fz_catch(fzctx)
+        ;
+
+    if (strlen(title) > 0) {
+        /* +1 for the NUL terminator copied by strcpy() */
+        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + strlen(title) + 1);
+        if (meta_content != NULL) {
+            meta_content->key = MetaTitle;
+            strcpy(meta_content->str_val, title);
+            APPEND_META(doc, meta_content)
+        }
+    }
+
+    int page_count = -1;
+    fz_var(page_count);
+    fz_var(err);
+    fz_try(fzctx)
+        page_count = fz_count_pages(fzctx, fzdoc);
+    fz_catch(fzctx)
+        err = fzctx->error.errcode;
+
+    if (err) {
+// LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, ctx->error.message)
+        close_ebook(fzctx, fzdoc, stream, buf);
+        return;
+    }
+
+    /* render_cover() returns FALSE on failure; only then is the document abandoned */
+    if (ctx->tn_size > 0 && render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
+        close_ebook(fzctx, fzdoc, stream, buf);
+        return;
+    }
+
+    if (ctx->content_size > 0) {
+        fz_stext_options opts = {0};
+        thread_buffer = text_buffer_create(ctx->content_size);
+
+        for (int current_page = 0; current_page < page_count; current_page++) {
+            fz_page *page = NULL;
+            fz_var(page);
+            fz_var(err);
+            fz_try(fzctx)
+                page = fz_load_page(fzctx, fzdoc, current_page);
+            fz_catch(fzctx)
+                err = fzctx->error.errcode;
+            if (err != 0) {
+// LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, ctx->error.message)
+                text_buffer_destroy(&thread_buffer);
+                fz_drop_page(fzctx, page);
+                close_ebook(fzctx, fzdoc, stream, buf);
+                return;
+            }
+
+            fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
+            fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
+            dev->stroke_path = NULL;
+            dev->stroke_text = NULL;
+            dev->clip_text = NULL;
+            dev->clip_stroke_path = NULL;
+            dev->clip_stroke_text = NULL;
+
+            if (ctx->tesseract_lang != NULL) {
+                dev->fill_image = fill_image;
+            }
+
+            fz_var(err);
+            fz_try(fzctx)
+                fz_run_page(fzctx, page, dev, fz_identity, NULL);
+            fz_always(fzctx)
+            {
+                fz_close_device(fzctx, dev);
+                fz_drop_device(fzctx, dev);
+            }
+            fz_catch(fzctx)
+                err = fzctx->error.errcode;
+
+            if (err != 0) {
+// LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, ctx->error.message)
+                text_buffer_destroy(&thread_buffer);
+                fz_drop_page(fzctx, page);
+                fz_drop_stext_page(fzctx, stext);
+                close_ebook(fzctx, fzdoc, stream, buf);
+                return;
+            }
+
+            fz_stext_block *block = stext->first_block;
+            while (block != NULL) {
+                int ret = read_stext_block(block, &thread_buffer);
+                if (ret == TEXT_BUF_FULL) {
+                    break;
+                }
+                block = block->next;
+            }
+            fz_drop_stext_page(fzctx, stext);
+            fz_drop_page(fzctx, page);
+
+            if (thread_buffer.dyn_buffer.cur >= thread_buffer.dyn_buffer.size) {
+                break;
+            }
+        }
+        text_buffer_terminate_string(&thread_buffer);
+
+        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
+        if (meta_content != NULL) {
+            meta_content->key = MetaContent;
+            memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
+            APPEND_META(doc, meta_content)
+        }
+
+        text_buffer_destroy(&thread_buffer);
+    }
+
+    close_ebook(fzctx, fzdoc, stream, buf);
+}
diff --git a/libscan/ebook/ebook.h b/libscan/ebook/ebook.h
new file mode 100644
index 0000000..7621e7f
--- /dev/null
+++ b/libscan/ebook/ebook.h
@@ -0,0 +1,17 @@
+#ifndef SCAN_EBOOK_H
+#define SCAN_EBOOK_H
+
+#include <pthread.h>
+#include "../scan.h"
+
+typedef struct {
+    long content_size;            /* size passed to text_buffer_create(); no text is extracted when <= 0 */
+    int tn_size;                  /* longest side of the cover thumbnail; no cover is rendered when <= 0 */
+    const char *tesseract_lang;   /* OCR language; image OCR is not hooked in when NULL */
+    const char *tesseract_path;   /* data path handed to TessBaseAPIInit3() */
+    pthread_mutex_t mupdf_mutex;  /* held while the cover page is drawn */
+} scan_ebook_ctx_t;
+
+void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str, document_t *doc);
+
+#endif
diff --git a/libscan/macros.h b/libscan/macros.h
index a84ba38..dad6ad4 100644
--- a/libscan/macros.h
+++ b/libscan/macros.h
@@ -1,5 +1,9 @@
 #ifndef FALSE
 #define FALSE (0)
 #endif
 
+#ifndef BOOL
+#define BOOL int
+#endif
+
 #ifndef TRUE
diff --git a/libscan/scan.h b/libscan/scan.h
index 36a9b3d..626ddb8 100644
--- a/libscan/scan.h
+++ b/libscan/scan.h
@@ -10,6 +10,8 @@
 #define META_STR_MASK 0x40
 #define META_LONG_MASK 0x20
 
+#define UNUSED(x) __attribute__((__unused__)) x
+
 #define META_STR(id) ((unsigned) id) | ((unsigned) META_STR_MASK)
 #define META_INT(id) ((unsigned) id) | ((unsigned) META_INT_MASK)
 #define META_LONG(id) ((unsigned) id) | ((unsigned) META_LONG_MASK)
@@ -89,6 +91,7 @@ typedef struct vfile {
 
     int is_fs_file;
     char *filepath;
+    struct stat info;
     read_func_t read;
     close_func_t close;
 
diff --git a/libscan/util.h b/libscan/util.h
index 20c3426..05c7a62 100644
--- a/libscan/util.h
+++ b/libscan/util.h
@@ -1,9 +1,9 @@
 #ifndef SIST2_UTIL_H
 #define SIST2_UTIL_H
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
 #include "../third-party/utf8.h/utf8.h"
 #include "macros.h"
 
@@ -26,7 +26,6 @@ typedef struct text_buffer {
     dyn_buffer_t dyn_buffer;
 } text_buffer_t;
 
-__always_inline
 static int utf8_validchr2(const char *s) {
     if (0x00 == (0x80 & *s)) {
         return TRUE;
@@ -75,7 +74,6 @@ static int utf8_validchr2(const char *s) {
 }
 
 
-__always_inline
 static dyn_buffer_t dyn_buffer_create() {
     dyn_buffer_t buf;
 
@@ -86,7 +84,6 @@ static dyn_buffer_t dyn_buffer_create() {
     return buf;
 }
 
-__always_inline
 static void grow_buffer(dyn_buffer_t *buf, size_t size) {
     if (buf->cur + size > buf->size) {
         do {
@@ -97,7 +94,6 @@ static void grow_buffer(dyn_buffer_t *buf, size_t size) {
     }
 }
 
-__always_inline
 static void grow_buffer_small(dyn_buffer_t *buf) {
     if (buf->cur + sizeof(long) > buf->size) {
         buf->size *= 2;
@@ -105,7 +101,6 @@ static void grow_buffer_small(dyn_buffer_t *buf) {
     }
 }
 
-__always_inline
 static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
     grow_buffer(buf, size);
 
@@ -113,7 +108,6 @@ static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
     buf->cur += size;
 }
 
-__always_inline
 static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
     grow_buffer_small(buf);
 
@@ -121,18 +115,15 @@ static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
     buf->cur += sizeof(c);
 }
 
-__always_inline
 static void dyn_buffer_write_str(dyn_buffer_t *buf, char *str) {
     dyn_buffer_write(buf, str, strlen(str));
     dyn_buffer_write_char(buf, '\0');
 }
 
-__always_inline
 static void dyn_buffer_append_string(dyn_buffer_t *buf, char *str) {
     dyn_buffer_write(buf, str, strlen(str));
 }
 
-__always_inline
 static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
     grow_buffer_small(buf);
 
@@ -140,7 +131,6 @@ static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
     buf->cur += sizeof(int);
 }
 
-__always_inline
 static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
     grow_buffer_small(buf);
 
@@ -148,7 +138,6 @@ static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) {
     buf->cur += sizeof(short);
 }
 
-__always_inline
 static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
     grow_buffer_small(buf);
 
@@ -156,17 +145,14 @@ static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
     buf->cur += sizeof(unsigned long);
 }
 
-__always_inline
 static void dyn_buffer_destroy(dyn_buffer_t *buf) {
     free(buf->buf);
 }
 
-__always_inline
 static void text_buffer_destroy(text_buffer_t *buf) {
     dyn_buffer_destroy(&buf->dyn_buffer);
 }
 
-__always_inline
 static text_buffer_t text_buffer_create(long max_size) {
     text_buffer_t text_buf;
 
@@ -177,7 +163,6 @@ static text_buffer_t text_buffer_create(long max_size) {
     return text_buf;
 }
 
-__always_inline
 static int text_buffer_append_char(text_buffer_t *buf, int c) {
 
     if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
@@ -218,7 +203,6 @@ static int text_buffer_append_char(text_buffer_t *buf, int c) {
 }
 
 
-__always_inline
 static void text_buffer_terminate_string(text_buffer_t *buf) {
     if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
         *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
@@ -233,7 +217,6 @@ static void text_buffer_terminate_string(text_buffer_t *buf) {
     (0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
     (0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
 
-__always_inline
 static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
 
     const char *ptr = str;
@@ -275,9 +258,23 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t
     return 0;
 }
 
-__always_inline
 static int text_buffer_append_string0(text_buffer_t *buf, char *str) {
     return text_buffer_append_string(buf, str, strlen(str));
 }
 
+/* Read up to f->info.st_size bytes through f->read() into a malloc'd buffer the caller must free; NULL (and *size = 0) if malloc fails */
+static void* read_all(vfile_t *f, size_t *size) {
+    void* buf = malloc(f->info.st_size);
+    if (buf == NULL) {
+        *size = 0;
+        return NULL;
+    }
+
+    *size = f->read(f, buf, f->info.st_size);
+
+    //TODO: log
+
+    return buf;
+}
+
 #endif
diff --git a/test/main.c b/test/main.c
index 821cacc..7e9008d 100644
--- a/test/main.c
+++ b/test/main.c
@@ -1,10 +1,11 @@
 #include "../libscan/text/text.h"
 #include
 #include "../libscan/arc/arc.h"
+#include "../libscan/ebook/ebook.h"
 
 int main() {
 
-    scan_text_ctx_t ctx;
+    scan_ebook_ctx_t ctx = {0};
 
     ctx.content_size = 100;
     vfile_t file;
@@ -18,5 +19,5 @@ int main() {
     doc.meta_tail = NULL;
     doc.size = 200;
 
-    parse_text(&ctx, &file, &doc);
+    parse_ebook(&ctx, &file,"application/pdf", &doc);
 }
\ No newline at end of file