Remove libscan git submodule

This commit is contained in:
simon987 2021-11-07 09:30:14 -05:00
parent 06f21d5f0f
commit a41b5dcc1f
42 changed files with 5508 additions and 7 deletions

2
.gitignore vendored
View File

@ -16,7 +16,7 @@ bundle.js
*.a
vgcore.*
build/
third-party/
third-party/argparse
*.idx/
VERSION
git_hash.h

6
.gitmodules vendored
View File

@ -4,3 +4,9 @@
[submodule "third-party/argparse"]
path = third-party/argparse
url = https://github.com/cofyc/argparse
[submodule "third-party/libscan/third-party/utf8.h"]
path = third-party/libscan/third-party/utf8.h
url = https://github.com/sheredom/utf8.h
[submodule "third-party/libscan/third-party/antiword"]
path = third-party/libscan/third-party/antiword
url = https://github.com/simon987/antiword

File diff suppressed because one or more lines are too long

@ -1 +1 @@
Subproject commit ffd9c23427d0cb105e27f27f0cf97b463b6a8bf8
Subproject commit c37e04a701a5cf8f246d43d5cd32461a1afcba67

12
third-party/libscan/.gitignore vendored Normal file
View File

@ -0,0 +1,12 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt
scan_test
third-party/ext_*
libscan-test-files
scan_*_test

233
third-party/libscan/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,233 @@
# Build script for libscan: declares the "scan" library target from the
# per-format parsers that live under libscan/.
cmake_minimum_required(VERSION 3.15)
project(scan)
set(CMAKE_C_STANDARD 11)
# BUILD_TESTS gates the GTest executables declared at the end of this file.
option(BUILD_TESTS "Build tests" on)
# antiword is built in-tree; "scan" both depends on it and links against it below.
add_subdirectory(third-party/antiword)
# Directory-scoped definitions (not limited to the "scan" target).
add_compile_definitions(
antiword
NDEBUG
)
# Sources are listed explicitly, one format family (source + header) per line.
add_library(
scan
libscan/util.c libscan/util.h
libscan/scan.h
libscan/macros.h
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
# The WPD parser goes through a C++ shim (libwpd_c_api.cpp).
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
# The target mixes C and C++ sources but is linked with the C driver; the C++
# runtime is added by hand ("stdc++") in target_link_libraries() below.
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
# Search order for find_library()/find_package(): static archives are listed
# before shared objects.
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
# Packages that ship CMake config files or have a Find module.
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED)
find_package(Threads REQUIRED)
find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
# Libraries located by bare name. None of these calls is REQUIRED, so a miss
# leaves the variable set to <VAR>-NOTFOUND.
# The "d"-suffixed names are presumably debug builds of the same library - TODO confirm.
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
find_package(unofficial-brotli CONFIG REQUIRED)
find_library(LZO2_LIB NAMES lzo2)
find_library(RAW_LIB NAMES libraw.a)
find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
# NOTE(review): the extra search paths are hard-coded to specific GCC versions
# and architectures; other toolchains rely on the default search path.
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
# Debug info is emitted for the library regardless of build type.
target_compile_options(
scan
PRIVATE
-g
)
# Third-party libraries that are downloaded and built at build time.
# Each project is built in-source (SOURCE_DIR == BINARY_DIR) and is never
# installed (INSTALL_COMMAND ""): "scan" consumes the headers and static
# archives straight from the build tree through the *_LIB_DIR / *_INCLUDE_DIR
# variables set after each project.
include(ExternalProject)
# First match wins: gmake, then nmake, then make.
find_program(MAKE_EXE NAMES gmake nmake make)
# libmobi, from the "public" ref of the simon987 fork.
ExternalProject_Add(
libmobi
GIT_REPOSITORY https://github.com/simon987/libmobi.git
GIT_TAG "public"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./autogen.sh && ./configure
INSTALL_COMMAND ""
PREFIX "third-party/ext_libmobi"
SOURCE_DIR "third-party/ext_libmobi/src/libmobi"
BINARY_DIR "third-party/ext_libmobi/src/libmobi"
# NOTE(review): the job count is hard-coded.
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent
)
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
# SIST_DEBUG is not defined in this file; presumably it is set by the parent
# project or on the command line - TODO confirm.
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
# FFmpeg n4.4, static libraries only, with programs, docs and several
# optional components disabled.
ExternalProject_Add(
ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "n4.4"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
# NOTE(review): the job count is hard-coded.
BUILD_COMMAND ${MAKE_EXE} -j33 --silent
)
# The FFmpeg source tree doubles as both library and include root.
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
# libwpd 0.9.9 release tarball, static only.
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
# NOTE(review): the job count is hard-coded.
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
# Build-order edges only: make sure the external projects and antiword are
# built before "scan" is compiled. The actual linking happens below.
add_dependencies(
scan
libmobi
ffmpeg
antiword
libwpd
)
# Everything is PUBLIC, so consumers of "scan" inherit the full link line.
# NOTE(review): several entries are static archives, for which link order
# normally matters; keep the sequence as-is unless verified.
target_link_libraries(
scan
PUBLIC
cjson
${LibArchive_LIBRARIES}
ZLIB::ZLIB
BZip2::BZip2
lz4::lz4
${LZO2_LIB}
LibLZMA::LibLZMA
${MUPDF_LIB}
openjp2
# Static archives produced by the ExternalProject builds above.
${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB}
${HARFBUZZ_LIB}
${JBIG2DEC_LIB}
# C++ runtime, needed because the target is linked with the C driver.
stdc++
-Wl,--whole-archive
m
-Wl,--no-whole-archive
${JPEG_LIBRARIES}
${Tesseract_LIBRARIES}
${LIBXML2_LIBRARIES}
${FREETYPE_LIB}
unofficial::brotli::brotlidec-static
${FFMPEG_LIB_DIR}/libavformat/libavformat.a
${FFMPEG_LIB_DIR}/libavcodec/libavcodec.a
${FFMPEG_LIB_DIR}/libavutil/libavutil.a
${FFMPEG_LIB_DIR}/libswresample/libswresample.a
${FFMPEG_LIB_DIR}/libswscale/libswscale.a
z
${CMAKE_THREAD_LIBS_INIT}
${RAW_LIB}
${GOMP_LIB}
${CMS_LIB}
${JAS_LIB}
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
# NOTE(review): MUPDF_INC_DIR is not set anywhere in this file; presumably it
# is provided by the parent project or the cache - confirm.
target_include_directories(
scan
PUBLIC
${MUPDF_INC_DIR}
${JPEG_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
)
if (BUILD_TESTS)
    find_package(GTest CONFIG REQUIRED)

    # The same test sources are compiled three times: with UBSan
    # (scan_ub_test), with ASan (scan_a_test) and uninstrumented (scan_test).
    foreach (scan_test_target IN ITEMS scan_ub_test scan_a_test scan_test)
        add_executable(${scan_test_target} test/main.cpp test/test_util.cpp test/test_util.h)
    endforeach ()

    # UndefinedBehaviorSanitizer build
    target_compile_options(scan_ub_test PRIVATE -g -fsanitize=undefined -fno-omit-frame-pointer)
    target_link_libraries(scan_ub_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=undefined scan)

    # AddressSanitizer build
    target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
    target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)

    # Plain build
    target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
    target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
endif ()

4
third-party/libscan/README.md vendored Normal file
View File

@ -0,0 +1,4 @@
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
```

244
third-party/libscan/libscan/arc/arc.c vendored Normal file
View File

@ -0,0 +1,244 @@
#include "arc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
/**
 * Decide whether a compressed ("filtered") file should be opened as an archive.
 *
 * Returns TRUE when the extension starts with "tgz", or when the file name
 * without its last extension ends in ".tar" (e.g. "backup.tar.gz").
 *
 * The original implementation copied `ext - 1` bytes into a fixed-size stack
 * buffer without checking the length and did not reject negative offsets.
 * This version inspects the path in place, so no bound is needed.
 *
 * @param filepath NUL-terminated path
 * @param ext      offset in filepath of the first character after the last
 *                 '.'; a value <= 0 yields FALSE
 * @return TRUE or FALSE
 */
int should_parse_filtered_file(const char *filepath, int ext) {
    static const char tar_suffix[] = ".tar";
    const size_t tar_suffix_len = sizeof(tar_suffix) - 1;

    if (ext <= 0) {
        return FALSE;
    }

    if (strncmp(filepath + ext, "tgz", 3) == 0) {
        return TRUE;
    }

    /* Length of the name once the final ".<ext>" has been removed. */
    const size_t stem_len = (size_t) ext - 1;
    if (stem_len < tar_suffix_len) {
        return FALSE;
    }

    /* "tar" contains no dot, so "stem ends with .tar" is the same test as
     * "the text starting at the stem's last dot equals .tar". */
    if (memcmp(filepath + stem_len - tar_suffix_len, tar_suffix, tar_suffix_len) == 0) {
        return TRUE;
    }

    return FALSE;
}
/**
 * Close callback for files read out of an archive: finalizes the SHA-1
 * digest into f->sha1_digest and releases the rewind buffer, if any.
 */
void arc_close(struct vfile *f) {
    SHA1_Final(f->sha1_digest, &f->sha1_ctx);

    if (f->rewind_buffer == NULL) {
        /* Nothing was buffered; leave the size and cursor untouched. */
        return;
    }

    free(f->rewind_buffer);
    f->rewind_buffer_cursor = 0;
    f->rewind_buffer_size = 0;
    f->rewind_buffer = NULL;
}
/**
 * Read up to `size` bytes of the current archive entry into `buf`.
 *
 * Bytes buffered by a previous arc_read_rewindable() call are replayed first;
 * anything beyond that comes from archive_read_data(). Freshly read bytes are
 * fed to the SHA-1 context when f->calculate_checksum is set. Bytes replayed
 * from the rewind buffer are not hashed here.
 *
 * archive_read_data() returns a signed count that is negative on failure. The
 * result is now kept signed, so a negative status always yields -1 instead of
 * being folded into the returned byte count.
 *
 * @return number of bytes placed in buf, or -1 on read error
 */
int arc_read(struct vfile *f, void *buf, size_t size) {
    int bytes_copied = 0;

    if (f->rewind_buffer_size != 0) {
        if (size > f->rewind_buffer_size) {
            /* Drain what is left of the rewind buffer, then keep reading. */
            memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);

            bytes_copied = f->rewind_buffer_size;
            size -= f->rewind_buffer_size;
            buf = (char *) buf + f->rewind_buffer_size;
            f->rewind_buffer_size = 0;
        } else {
            /* The request is fully satisfied by the rewind buffer. */
            memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
            f->rewind_buffer_size -= (int) size;
            f->rewind_buffer_cursor += (int) size;

            return (int) size;
        }
    }

    ssize_t bytes_read = archive_read_data(f->arc, buf, size);

    if (bytes_read > 0 && (size_t) bytes_read <= size && f->calculate_checksum) {
        f->has_checksum = TRUE;
        safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
    }

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    return (int) bytes_read + bytes_copied;
}
/**
 * Read up to `size` bytes of the current archive entry into `buf` and keep a
 * copy in f->rewind_buffer, so that the next arc_read() calls replay the same
 * bytes.
 *
 * May be called only once per entry: if a rewind buffer already exists the
 * process exits.
 *
 * Fixes over the previous version:
 *  - the result of archive_read_data() is kept signed, so a negative status
 *    is reported as an error;
 *  - only the bytes actually read are recorded in the rewind buffer (a short
 *    read used to replay `size` bytes, including bytes that were never read);
 *  - allocation failure is reported instead of crashing in memcpy().
 *
 * @return number of bytes read, or -1 on error
 */
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
    if (f->rewind_buffer != NULL) {
        fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
        exit(-1);
    }

    ssize_t bytes_read = archive_read_data(f->arc, buf, size);

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    /* Never request 0 bytes: malloc(0) may legitimately return NULL. */
    f->rewind_buffer = malloc(size > 0 ? size : 1);
    if (f->rewind_buffer == NULL) {
        f->logf(f->filepath, LEVEL_ERROR, "Could not allocate rewind buffer");
        return -1;
    }

    memcpy(f->rewind_buffer, buf, (size_t) bytes_read);
    f->rewind_buffer_size = (int) bytes_read;
    f->rewind_buffer_cursor = 0;

    return (int) bytes_read;
}
/**
 * Create a libarchive reader for `f` and store it in `*a`.
 *
 * Filesystem files are opened by path. Any other file is read through the
 * vfile callbacks, but only when `allow_recurse` is set. Otherwise nothing is
 * created, `*a` is left untouched and ARC_SKIPPED is returned.
 *
 * @return ARC_SKIPPED, or the libarchive status of the open call
 */
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
    arc_data->f = f;

    if (!f->is_fs_file && !allow_recurse) {
        return ARC_SKIPPED;
    }

    /* Setup shared by both open paths. */
    *a = archive_read_new();
    archive_read_support_filter_all(*a);
    archive_read_support_format_all(*a);
    if (ctx->passphrase[0] != 0) {
        archive_read_add_passphrase(*a, ctx->passphrase);
    }

    if (f->is_fs_file) {
        return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
    }

    return archive_read_open(
            *a, arc_data,
            vfile_open_callback,
            vfile_read_callback,
            vfile_close_callback
    );
}
/* Per-thread output vector handed to pcre_exec(). */
static __thread int sub_strings[30];

/*
 * TRUE when `str` matches the exclude pattern. Expects `exclude` and
 * `exclude_extra` to be in scope at the call site.
 *
 * pcre_exec() takes the size of the output vector as a number of int
 * ELEMENTS. Passing sizeof(sub_strings), a byte count four times too large on
 * common ABIs, lets PCRE write past the end of the array.
 */
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, (int) (sizeof(sub_strings) / sizeof(sub_strings[0]))) >= 0)
/**
 * Scan an archive.
 *
 * In ARC_MODE_LIST the names of all regular entries are joined with spaces
 * and attached to `doc` as MetaContent. In every other mode each regular
 * entry is handed to ctx->parse() as a sub-job whose path is
 * "<archive path>#/<entry path>", unless that path matches `exclude`.
 *
 * Files that are not plain filesystem files are only opened when the mode is
 * ARC_MODE_RECURSE; otherwise the archive is skipped and SCAN_OK is returned.
 *
 * The sub-job path used to be built with sprintf() into a buffer of fixed
 * size, so a long entry name could overflow the heap. It is now built with
 * snprintf() and over-long entries are skipped with an error log.
 *
 * @param exclude       compiled exclude pattern, or NULL for none
 * @param exclude_extra study data for `exclude`
 * @return SCAN_OK, or SCAN_ERR_READ when the archive cannot be opened
 */
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;

    arc_data_t arc_data;
    arc_data.f = f;

    int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
    if (ret == ARC_SKIPPED) {
        return SCAN_OK;
    }

    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return SCAN_ERR_READ;
    }

    if (ctx->mode == ARC_MODE_LIST) {
        dyn_buffer_t buf = dyn_buffer_create();

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
                const char *utf8_name = archive_entry_pathname_utf8(entry);
                const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

                dyn_buffer_append_string(&buf, file_path);
                dyn_buffer_write_char(&buf, ' ');
            }
        }
        dyn_buffer_write_char(&buf, '\0');

        meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
        meta_list->key = MetaContent;
        strcpy(meta_list->str_val, buf.buf);
        APPEND_META(doc, meta_list)
        dyn_buffer_destroy(&buf);

    } else {
        /* Bytes reserved behind the job struct for the sub-job path.
         * Assumes `filepath` is the trailing member of parse_job_t - TODO confirm. */
        const size_t filepath_cap = PATH_MAX * 2;

        parse_job_t *sub_job = malloc(sizeof(parse_job_t) + filepath_cap);

        sub_job->vfile.close = arc_close;
        sub_job->vfile.read = arc_read;
        sub_job->vfile.read_rewindable = arc_read_rewindable;
        sub_job->vfile.reset = NULL;
        sub_job->vfile.arc = a;
        sub_job->vfile.filepath = sub_job->filepath;
        sub_job->vfile.is_fs_file = FALSE;
        sub_job->vfile.rewind_buffer_size = 0;
        sub_job->vfile.rewind_buffer = NULL;
        sub_job->vfile.log = ctx->log;
        sub_job->vfile.logf = ctx->logf;
        sub_job->vfile.has_checksum = FALSE;
        sub_job->vfile.calculate_checksum = f->calculate_checksum;
        memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            sub_job->vfile.info = *archive_entry_stat(entry);
            if (S_ISREG(sub_job->vfile.info.st_mode)) {

                const char *utf8_name = archive_entry_pathname_utf8(entry);
                const char *entry_name = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

                int written = snprintf(sub_job->filepath, filepath_cap, "%s#/%s", f->filepath, entry_name);
                if (written < 0 || (size_t) written >= filepath_cap) {
                    CTX_LOG_ERRORF(f->filepath, "(arc.c) Skipping entry, path is too long (%d bytes)", written)
                    continue;
                }

                sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

                // Handle excludes
                if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
                    CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
                    continue;
                }

                /* Only a dot that belongs to the entry name counts as an
                 * extension separator, not one inside the archive's own path. */
                char *p = strrchr(sub_job->filepath, '.');
                if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
                    sub_job->ext = (int) (p - sub_job->filepath + 1);
                } else {
                    sub_job->ext = (int) strlen(sub_job->filepath);
                }

                SHA1_Init(&sub_job->vfile.sha1_ctx);

                ctx->parse(sub_job);
            }
        }

        free(sub_job);
    }

    archive_read_free(a);
    return SCAN_OK;
}

80
third-party/libscan/libscan/arc/arc.h vendored Normal file
View File

@ -0,0 +1,80 @@
#ifndef SCAN_ARC_H
#define SCAN_ARC_H
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
/* Returned by arc_open() when the file is neither a filesystem file nor
 * allowed to be opened recursively; no archive handle is created. */
# define ARC_SKIPPED (-1)
/* Archive handling modes. parse_archive() special-cases LIST (entry names
 * become the document content) and RECURSE (archives that are not plain
 * filesystem files are opened too). How SKIP and SHALLOW are distinguished
 * is not visible in this header - see the callers. */
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
/* Settings and callbacks for archive scanning. */
typedef struct {
/* One of the ARC_MODE_* values. */
archive_mode_t mode;
/* Invoked by parse_archive() for every regular entry. */
parse_callback_t parse;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
/* Passphrase handed to libarchive; an empty string (first byte 0) means none. */
char passphrase[4096];
} scan_arc_ctx_t;
/* Block size used when opening by filename, and size of the callback read buffer. */
#define ARC_BUF_SIZE 8192
/* Client data handed to the libarchive callbacks below. */
typedef struct {
/* File the archive bytes are read from. */
vfile_t *f;
/* Scratch buffer whose address is handed to libarchive by vfile_read_callback(). */
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
/* Creates a libarchive reader for f in *a; returns ARC_SKIPPED or a libarchive status. */
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
/* TRUE for names whose extension starts with "tgz" or that end in ".tar.<ext>". */
int should_parse_filtered_file(const char *filepath, int ext);
/* Lists or parses the entries of an archive, depending on ctx->mode. */
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
/* vfile read(): replays rewind-buffer bytes first, then reads from the archive. */
int arc_read(struct vfile *f, void *buf, size_t size);
/* vfile read_rewindable(): reads and keeps a copy for replay by arc_read(). */
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
/* vfile close(): finalizes the SHA-1 digest and frees the rewind buffer. */
void arc_close(struct vfile *f);
#endif

View File

@ -0,0 +1,58 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
/* Archive context used only to open comic archives: no passphrase. */
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};

/**
 * Generate a thumbnail for a comic book archive from the first .png/.jpg/.jpeg
 * entry that store_image_thumbnail() accepts.
 *
 * Does nothing when thumbnails are disabled (ctx->tn_size <= 0).
 *
 * The entry buffer allocation is now checked: the size comes from the archive
 * header, so a huge value made malloc() return NULL and the following read
 * wrote through a NULL pointer. Such entries are logged and skipped.
 */
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

        const char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".png") != 0 && strcmp(p, ".jpg") != 0 && strcmp(p, ".jpeg") != 0)) {
            continue;
        }

        size_t entry_size = archive_entry_size(entry);
        void *buf = malloc(entry_size);
        if (buf == NULL) {
            CTX_LOG_ERRORF("comic.c", "Could not allocate %zu bytes for entry", entry_size)
            /* libarchive skips unread entry data on the next header read. */
            continue;
        }

        size_t read = archive_read_data(a, buf, entry_size);
        if (read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }

        /* NOTE(review): the comic ctx is cast to scan_media_ctx_t; presumably
         * the two structs share their leading fields - confirm. */
        ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path);
        free(buf);

        if (ret == TRUE) {
            /* One thumbnail is enough. */
            break;
        }
    }

    archive_read_free(a);
}

View File

@ -0,0 +1,31 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
/* Settings and callbacks for comic book (cbr/cbz) thumbnail generation.
 * NOTE(review): parse_comic() casts a pointer to this struct to
 * scan_media_ctx_t*; presumably the leading fields must stay in sync with
 * that type - confirm before reordering. */
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
/* Thumbnail size; parse_comic() does nothing when this is <= 0. */
int tn_size;
float tn_qscale;
/* Mime identifiers that is_cbr()/is_cbz() compare against. */
unsigned int cbr_mime;
unsigned int cbz_mime;
} scan_comic_ctx_t;
/* Non-zero when `mime` is the identifier registered for cbr files. */
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
    return ctx->cbr_mime == mime;
}

/* Non-zero when `mime` is the identifier registered for cbz files. */
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
    return ctx->cbz_mime == mime;
}
/* Stores a thumbnail for the comic archive f; does nothing when ctx->tn_size <= 0. */
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -0,0 +1,495 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
/* Images must exceed this size in pixels before fill_image() runs OCR on them. */
#define MIN_OCR_SIZE 350
/* OCR output shorter than this many bytes is discarded. */
#define MIN_OCR_LEN 10
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
/* Per-thread state: the text being collected for the current document and a
 * copy of the active context (set at the start of parse_ebook_mem()). */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
/* Process-wide mutex behind the FZ_LOCK_FREETYPE lock; initialized lazily in init_fzctx(). */
pthread_mutex_t Mutex;
/* MuPDF lock callback: only the FreeType lock is backed by a real mutex. */
static void my_fz_lock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_lock(&Mutex);
}

/* MuPDF unlock callback, counterpart of my_fz_lock(). */
static void my_fz_unlock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_unlock(&Mutex);
}
/**
 * Tell whether every sample of the pixmap has the same value.
 *
 * A pixmap without samples (zero width/height/components, or no sample
 * buffer) is reported as blank. Previously samples[0] was read
 * unconditionally, which is out of bounds for an empty pixmap.
 *
 * @return TRUE when the pixmap is uniform or empty, FALSE otherwise
 */
int pixmap_is_blank(const fz_pixmap *pixmap) {
    int pixmap_size = pixmap->n * pixmap->w * pixmap->h;

    if (pixmap_size <= 0 || pixmap->samples == NULL) {
        return TRUE;
    }

    const int pixel0 = pixmap->samples[0];

    /* Sample 0 trivially equals itself; start at 1. */
    for (int i = 1; i < pixmap_size; i++) {
        if (pixmap->samples[i] != pixel0) {
            return FALSE;
        }
    }

    return TRUE;
}
/*
 * Render page number `page` of fzdoc into a new RGB pixmap whose longest side
 * is scaled to ctx->tn_size.
 *
 * On success the loaded page is left in *cover and the caller owns both the
 * page and the returned pixmap. When rendering fails or the pixmap does not
 * have 3 components, both are dropped here and NULL is returned. NULL is
 * also returned when the page cannot be loaded.
 */
fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0;
fz_var(cover);
fz_var(err);
// Note: on failure err is set to the constant 1 here, not to the MuPDF error code.
fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
fz_catch(fzctx)err = 1;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
return NULL;
}
fz_rect bounds = fz_bound_page(fzctx, *cover);
// Scale so that the longest side becomes tn_size.
// NOTE(review): a page whose longest side is 0 would divide by zero here - confirm this cannot happen.
float scale;
float w = bounds.x1 - bounds.x0;
float h = bounds.y1 - bounds.y0;
if (w > h) {
scale = (float) ctx->tn_size / w;
} else {
scale = (float) ctx->tn_size / h;
}
fz_matrix m = fz_scale(scale, scale);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
// These MuPDF calls run outside of any fz_try block.
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fz_device_rgb(fzctx), bbox, NULL, 0);
// Start from a white (0xFF) canvas.
fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
fz_var(err);
fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
// render_cover() feeds the samples to swscale as RGB24, so exactly 3 components are required.
if (pixmap->n != 3) {
CTX_LOG_ERRORF(doc->filepath, "Got unexpected pixmap depth: %d", pixmap->n)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
return pixmap;
}
/**
 * Render the cover of `fzdoc` as a JPEG thumbnail and hand it to ctx->store(),
 * keyed by doc->path_md5.
 *
 * Page 0 is used unless it renders as a uniform image, in which case page 1
 * is tried instead.
 *
 * The scratch buffer allocation is now checked. It used to be passed straight
 * to memcpy(), so a failed 1 GiB calloc() crashed the process. A page whose
 * samples do not fit in the buffer is rejected as well.
 *
 * @return TRUE when a thumbnail was stored, FALSE otherwise
 */
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
    fz_page *cover = NULL;
    fz_pixmap *pixmap = load_pixmap(ctx, 0, fzctx, fzdoc, doc, &cover);
    if (pixmap == NULL) {
        return FALSE;
    }

    if (pixmap_is_blank(pixmap)) {
        fz_drop_page(fzctx, cover);
        fz_drop_pixmap(fzctx, pixmap);
        CTX_LOG_DEBUG(doc->filepath, "Cover page is blank, using page 1 instead")
        pixmap = load_pixmap(ctx, 1, fzctx, fzdoc, doc, &cover);
        if (pixmap == NULL) {
            return FALSE;
        }
    }

    /* The samples are copied into a large zeroed scratch buffer before being
     * handed to swscale. NOTE(review): the 1 GiB size is kept from the
     * original code; presumably it is there to tolerate over-reads by
     * sws_scale - confirm before shrinking it. */
    const size_t scratch_len = 1024 * 1024 * 1024;
    const size_t samples_len = (size_t) pixmap->stride * (size_t) pixmap->h;

    unsigned char *samples = samples_len <= scratch_len ? calloc(1, scratch_len) : NULL;
    if (samples == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not allocate scratch buffer for cover thumbnail")
        fz_drop_pixmap(fzctx, pixmap);
        fz_drop_page(fzctx, cover);
        return FALSE;
    }
    memcpy(samples, pixmap->samples, samples_len);

    // RGB24 -> YUV420p
    AVFrame *scaled_frame = av_frame_alloc();

    struct SwsContext *sws_ctx = sws_getContext(
            pixmap->w, pixmap->h, AV_PIX_FMT_RGB24,
            pixmap->w, pixmap->h, AV_PIX_FMT_YUV420P,
            SIST_SWS_ALGO, 0, 0, 0
    );

    int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
    uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);

    av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
                         1);

    const uint8_t *in_data[1] = {samples,};
    int in_line_size[1] = {(int) pixmap->stride};

    sws_scale(sws_ctx,
              in_data, in_line_size,
              0, pixmap->h,
              scaled_frame->data, scaled_frame->linesize
    );

    scaled_frame->width = pixmap->w;
    scaled_frame->height = pixmap->h;
    scaled_frame->format = AV_PIX_FMT_YUV420P;

    sws_freeContext(sws_ctx);

    // YUV420p -> JPEG
    AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
    avcodec_send_frame(jpeg_encoder, scaled_frame);

    AVPacket jpeg_packet;
    av_init_packet(&jpeg_packet);
    avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

    APPEND_TN_META(doc, pixmap->w, pixmap->h)
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);

    free(samples);
    av_packet_unref(&jpeg_packet);
    /* data[0] is dst_buf, allocated with av_malloc() above. */
    av_free(*scaled_frame->data);
    av_frame_free(&scaled_frame);
    avcodec_free_context(&jpeg_encoder);

    fz_drop_pixmap(fzctx, pixmap);
    fz_drop_page(fzctx, cover);

    return TRUE;
}
/* MuPDF error printer: forwards the message as a warning for the document
 * passed as user data. The log macro picks up the thread-local context
 * through the local named `ctx`. */
void fz_err_callback(void *user, const char *message) {
    const scan_ebook_ctx_t *ctx = &thread_ctx;
    document_t *doc = (document_t *) user;

    CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message)
}

/* MuPDF warning printer: same as above, but logged at debug level. */
void fz_warn_callback(void *user, const char *message) {
    const scan_ebook_ctx_t *ctx = &thread_ctx;
    document_t *doc = (document_t *) user;

    CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message)
}
/**
 * Prepare a fresh MuPDF context: register the document handlers, make sure
 * the global FreeType mutex exists, and install the logging and locking
 * callbacks. `doc` becomes the user data of the warning/error printers.
 */
static void init_fzctx(fz_context *fzctx, document_t *doc) {
    /* NOTE(review): this flag is a plain static int with no synchronization. */
    static int mu_is_initialized = FALSE;

    fz_register_document_handlers(fzctx);

    if (mu_is_initialized == FALSE) {
        pthread_mutex_init(&Mutex, NULL);
        mu_is_initialized = TRUE;
    }

    /* Logging hooks */
    fzctx->warn.print = fz_warn_callback;
    fzctx->warn.print_user = doc;
    fzctx->error.print = fz_err_callback;
    fzctx->error.print_user = doc;

    /* Locking hooks */
    fzctx->locks.lock = my_fz_lock;
    fzctx->locks.unlock = my_fz_unlock;
}
/**
 * Append the characters of a structured-text block to `tex`.
 *
 * A space is written before every line and once more after the last line.
 * Blocks that are not text blocks are ignored.
 *
 * @return TEXT_BUF_FULL as soon as a character no longer fits, 0 otherwise
 */
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
    if (block->type != FZ_STEXT_BLOCK_TEXT) {
        return 0;
    }

    for (fz_stext_line *ln = block->u.t.first_line; ln != NULL; ln = ln->next) {
        text_buffer_append_char(tex, ' ');

        for (fz_stext_char *ch = ln->first_char; ch != NULL; ch = ch->next) {
            if (text_buffer_append_char(tex, ch->c) == TEXT_BUF_FULL) {
                return TEXT_BUF_FULL;
            }
        }
    }

    text_buffer_append_char(tex, ' ');
    return 0;
}
/* Values accepted by fill_image() for an image's `n` field. */
#define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32)

/**
 * MuPDF device callback: run OCR on sufficiently large images and append the
 * recognized text to the thread-local text buffer.
 *
 * The Tesseract settings come from the thread-local context copy, because
 * this callback has no user-data parameter.
 *
 * Fixes over the previous version:
 *  - the string returned by TessBaseAPIGetUTF8Text() is released with
 *    TessDeleteText() (it used to leak on every image);
 *  - a NULL result is tolerated instead of being passed to strlen();
 *  - the image is only processed when TessBaseAPIInit3() succeeds.
 */
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
                fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
                UNUSED(fz_color_params color_params)) {

    int l2factor = 0;

    if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {

        fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);

        /* NOTE(review): img->h is tested a second time here; possibly pix->w
         * was intended - condition kept as-is. */
        if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
            TessBaseAPI *api = TessBaseAPICreate();

            /* Init3 returns 0 on success. */
            if (TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang) == 0) {
                TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
                TessBaseAPISetSourceResolution(api, pix->xres);

                char *text = TessBaseAPIGetUTF8Text(api);
                if (text != NULL) {
                    size_t len = strlen(text);
                    if (len >= MIN_OCR_LEN) {
                        /* len - 1: the last byte of the OCR output is not appended. */
                        text_buffer_append_string(&thread_buffer, text, len - 1);
                    }
                    TessDeleteText(text);
                }
            }

            TessBaseAPIEnd(api);
            TessBaseAPIDelete(api);
        }
        fz_drop_pixmap(fzctx, pix);
    }
}
/*
 * Parse an in-memory document with MuPDF.
 *
 * Steps, in order:
 *   1. open the buffer as a document of type mime_str;
 *   2. record the page count (MetaPages);
 *   3. when ctx->tn_size > 0, render and store the cover thumbnail;
 *   4. unless tn_only is set: record title and author, then, when
 *      ctx->content_size > 0, extract the text of each page (with OCR of
 *      embedded images when ctx->tesseract_lang is set) into MetaContent.
 *
 * Any MuPDF error aborts the whole function after releasing the stream,
 * document and context; the metadata appended up to that point stays on doc.
 */
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
// Keep a thread-local copy of the context for the callbacks (fill_image, fz_*_callback).
thread_ctx = *ctx;
init_fzctx(fzctx, doc);
int err = 0;
fz_document *fzdoc = NULL;
fz_stream *stream = NULL;
fz_var(fzdoc);
fz_var(stream);
fz_var(err);
fz_try(fzctx) {
stream = fz_open_memory(fzctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
} fz_catch(fzctx)err = fzctx->error.errcode;
// Failure to open is not logged here.
if (err != 0) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
int page_count = -1;
fz_var(err);
fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
APPEND_LONG_META(doc, MetaPages, page_count)
// Thumbnail. A failed cover render aborts the rest of the parse as well.
if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
}
if (tn_only) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// Title / author. Lookup errors are deliberately ignored; the buffers then
// stay empty and no metadata is appended.
char title[8192] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_catch(fzctx);
if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
char author[4096] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_catch(fzctx);
if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
// Text extraction, page by page, into the thread-local buffer.
if (ctx->content_size > 0) {
fz_stext_options opts = {0};
thread_buffer = text_buffer_create(ctx->content_size);
for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL;
fz_var(err);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
// Clear the device callbacks that are not wanted for text extraction.
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
// OCR of embedded images is enabled only when a Tesseract language is configured.
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_block *block = stext->first_block;
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
// Stop reading pages once the configured amount of text has been collected.
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
break;
}
}
text_buffer_terminate_string(&thread_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&thread_buffer);
}
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
}
/* Archive context used only to open epub containers: no passphrase. */
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};

/**
 * Fast epub text extraction: open the epub as an archive and append the
 * markup of every .html/.xhtml entry to the document content (MetaContent),
 * without going through MuPDF.
 *
 * Fixes over the previous version:
 *  - the content buffer is created only after the early-return checks, so it
 *    no longer leaks when the function bails out;
 *  - the per-entry allocation is validated: the size comes from the archive
 *    header and was used unchecked for malloc() and for the terminating NUL.
 */
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    /* NOTE(review): this gate tests the thumbnail size although the function
     * only extracts text; content_size may have been intended. Behavior kept. */
    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    text_buffer_t content_buffer = text_buffer_create(ctx->content_size);

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

        const char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".html") != 0 && strcmp(p, ".xhtml") != 0)) {
            continue;
        }

        /* Reject negative sizes before converting to size_t: -1 would wrap
         * `entry_size + 1` to 0 and turn the read into an overflow. */
        long long declared_size = archive_entry_size(entry);
        if (declared_size < 0) {
            CTX_LOG_ERRORF("ebook.c", "Skipping entry with invalid size: %s", file_path)
            continue;
        }
        size_t entry_size = (size_t) declared_size;

        void *buf = malloc(entry_size + 1);
        if (buf == NULL) {
            CTX_LOG_ERRORF("ebook.c", "Could not allocate %zu bytes for entry", entry_size)
            continue;
        }

        size_t read = archive_read_data(a, buf, entry_size);
        ((char *) buf)[entry_size] = '\0';

        if (read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }

        ret = text_buffer_append_markup(&content_buffer, buf);
        free(buf);

        if (ret == TEXT_BUF_FULL) {
            break;
        }
    }

    text_buffer_terminate_string(&content_buffer);

    meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
    meta_content->key = MetaContent;
    memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
    APPEND_META(doc, meta_content)

    text_buffer_destroy(&content_buffer);

    archive_read_free(a);
}
/**
 * Entry point for ebook parsing.
 *
 * Epub files take the archive-based fast path when ctx->fast_epub_parse is
 * set. Everything else is read fully into memory and handed to
 * parse_ebook_mem() with tn_only disabled.
 */
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
    const int use_fast_path = ctx->fast_epub_parse && is_epub(mime_str);

    if (use_fast_path) {
        parse_epub_fast(ctx, f, doc);
        return;
    }

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    if (buf != NULL) {
        parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE);
        free(buf);
        return;
    }

    CTX_LOG_ERROR(f->filepath, "read_all() failed")
}

View File

@ -0,0 +1,30 @@
#ifndef SCAN_EBOOK_H
#define SCAN_EBOOK_H
#include "../scan.h"
/* Settings and callbacks for ebook / PDF parsing. */
typedef struct {
/* Size passed to text_buffer_create(); values <= 0 disable text extraction in parse_ebook_mem(). */
long content_size;
/* Target size of the cover thumbnail's longest side; <= 0 disables the thumbnail. */
int tn_size;
/* Tesseract language; NULL disables OCR of embedded images. */
const char *tesseract_lang;
/* Second argument of TessBaseAPIInit3(). */
const char *tesseract_path;
/* NOTE(review): not referenced by the ebook.c code visible here - confirm whether it is still used. */
pthread_mutex_t mupdf_mutex;
log_callback_t log;
logf_callback_t logf;
/* Receives the encoded thumbnail, keyed by the document's path_md5. */
store_callback_t store;
/* Non-zero: epub files are parsed with parse_epub_fast() instead of MuPDF. */
int fast_epub_parse;
/* Passed to alloc_jpeg_encoder() when encoding the thumbnail. */
float tn_qscale;
} scan_ebook_ctx_t;
/* Parses the file f of type mime_str; epub files may take the fast archive-based path. */
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
/* Parses a document held in memory; with tn_only set, it returns right after the thumbnail step. */
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
/* Returns 1 when mime_string is exactly the epub mime type, 0 otherwise. */
__always_inline
static int is_epub(const char *mime_string) {
    static const char epub_mime[] = "application/epub+zip";
    return !strcmp(mime_string, epub_mime);
}
#endif

246
third-party/libscan/libscan/font/font.c vendored Normal file
View File

@ -0,0 +1,246 @@
#include "font.h"
#include <ft2build.h>
#include <freetype/freetype.h>
#include "../util.h"
/* Per-thread FreeType library handle; starts out NULL. */
__thread FT_Library ft_lib = NULL;
/* Pixel dimensions of a rendered line of text, as computed by text_dimension(). */
typedef struct text_dimensions {
unsigned int width;
unsigned int height;
/* Largest descent among the glyphs of the text. */
unsigned int baseline;
} text_dimensions_t;
/* Metrics of one rendered glyph, filled from a FreeType glyph slot by ft_glyph_to_glyph(). */
typedef struct glyph {
/* bitmap_top of the slot. */
int top;
/* Bitmap rows. */
int height;
/* Bitmap width. */
int width;
/* max(0, height - top). */
int descent;
/* max(0, max(top, height) - descent). */
int ascent;
/* Horizontal advance divided by 64 (FreeType 26.6 fixed point). */
int advance_width;
/* Points at the slot's bitmap buffer; not a copy. */
unsigned char *pixmap;
} glyph_t;
/**
 * Horizontal kerning, in whole pixels, to apply between the previous
 * character `pc` and the current character `c`.
 *
 * FT_Get_Kerning() expects GLYPH INDICES in (left, right) order. The previous
 * code passed raw character codes in (current, previous) order, so the lookup
 * hit unrelated glyph pairs. The characters are now mapped with
 * FT_Get_Char_Index() and passed as (previous, current).
 *
 * @param c    current character
 * @param pc   previous character, or 0 when `c` is the first one
 * @param face face the text is rendered with
 * @return kerning in pixels; 0 when there is no previous character or the
 *         lookup fails
 */
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
    if (pc == 0) {
        /* First character of the text: nothing to kern against. */
        return 0;
    }

    FT_UInt left_glyph = FT_Get_Char_Index(face, pc);
    FT_UInt right_glyph = FT_Get_Char_Index(face, c);

    FT_Vector kerning;
    if (FT_Get_Kerning(face, left_glyph, right_glyph, FT_KERNING_DEFAULT, &kerning) != 0) {
        return 0;
    }

    /* 26.6 fixed point -> pixels */
    return (int) (kerning.x / 64);
}
/**
 * Convert a FreeType glyph slot into the local glyph_t representation.
 * The pixmap field aliases the slot's bitmap buffer.
 */
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
    glyph_t out;

    out.pixmap = slot->bitmap.buffer;

    out.width = (int) slot->bitmap.width;
    out.height = (int) slot->bitmap.rows;
    out.top = slot->bitmap_top;
    out.advance_width = (int) slot->advance.x / 64;

    /* Part of the bitmap that hangs below the baseline. */
    const int below_baseline = MAX(0, out.height - out.top);
    out.descent = below_baseline;
    out.ascent = MAX(0, MAX(out.top, out.height) - below_baseline);

    return out;
}
/**
 * Measure `text` as rendered with `face`.
 *
 * The width is the sum, over all characters, of the larger of the glyph's
 * advance and its bitmap width, plus the kerning against the previous
 * character. The height is the largest ascent plus the largest descent, and
 * the baseline is the largest descent.
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t dims;
    dims.width = 0;

    unsigned int max_ascent = 0;
    int max_descent = 0;
    char prev = 0;

    const int count = (int) strlen(text);
    int pos = 0;

    while (pos < count) {
        const char cur = text[pos];
        pos++;

        /* Loads the glyph into face->glyph. */
        FT_Load_Char(face, cur, 0);
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        max_descent = MAX(max_descent, glyph.descent);
        max_ascent = MAX(max_ascent, MAX(glyph.height, glyph.ascent));

        int kerning_x = kerning_offset(cur, prev, face);
        dims.width += MAX(glyph.advance_width, glyph.width) + kerning_x;

        prev = cur;
    }

    dims.height = max_ascent + max_descent;
    dims.baseline = max_descent;

    return dims;
}
/*
 * ORs the glyph's pixels into `bitmap`, a canvas of text_info.width by
 * text_info.height bytes, with the glyph's top-left corner at (x, y).
 * Writes that would land past the end of the canvas are dropped.
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    const unsigned int canvas_len = text_info.width * text_info.height;
    // Distance from the end of one glyph row to the start of the next one
    const unsigned int skip = text_info.width - glyph->width;

    const unsigned char *pixel = glyph->pixmap;
    unsigned int out = y * text_info.width + x;

    for (unsigned int row = 0; row < glyph->height; row++, out += skip) {
        for (unsigned int col = 0; col < glyph->width; col++, out++, pixel++) {
            if (out < canvas_len) {
                bitmap[out] |= *pixel;
            }
        }
    }
}
/*
 * Serializes an 8-bit grayscale canvas as a BMP file into `buf`.
 *
 * buf        - output buffer; the BMP is expected to start at its offset 0
 *              (the file-size field is patched at byte 2 of buf->buf).
 * dimensions - width/height of `bitmap` in pixels.
 * bitmap     - width * height bytes, top row first.
 *
 * Each pixel row is padded to a multiple of 4 bytes relative to the start of
 * the row. The header plus palette is 1078 bytes, which is not a multiple of
 * 4, so the padding cannot be derived from the absolute buffer position.
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {

    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale), inverted so that a 0 pixel is white
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // Pixel array: write from bottom to top, with rows padded to multiples of 4-bytes
    const unsigned int row_padding = (4 - dimensions.width % 4) % 4;
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int pad = 0; pad < row_padding; pad++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Size: patch the placeholder. memcpy avoids a misaligned int store at offset 2.
    int total_size = (int) buf->cur;
    memcpy((char *) buf->buf + 2, &total_size, sizeof(total_size));
}
/*
 * Reads a font file, records its display name as MetaFontName metadata and,
 * unless the enable_tn check below returns early, renders that name with the
 * font itself into a grayscale BMP thumbnail handed to ctx->store().
 *
 * ctx - font scan settings and callbacks.
 * f   - file to read; its whole content is loaded in memory.
 * doc - document receiving the metadata lines.
 */
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
    // The FreeType library handle is created lazily, once per thread
    if (ft_lib == NULL) {
        FT_Init_FreeType(&ft_lib);
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
    if (err != 0) {
        CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
                       FT_Error_String(err))
        free(buf);
        return;
    }

    // Build "<family> <style>"; snprintf always terminates and never receives a NULL string
    char font_name[4096];
    const char *family = face->family_name != NULL ? face->family_name : "(null)";
    if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
        snprintf(font_name, sizeof(font_name), "%s", family);
    } else {
        snprintf(font_name, sizeof(font_name), "%s %s", family, face->style_name);
    }

    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name) + 1);
    if (meta_name == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(font.c) Could not allocate metadata line")
        FT_Done_Face(face);
        free(buf);
        return;
    }
    meta_name->key = MetaFontName;
    strcpy(meta_name->str_val, font_name);
    APPEND_META(doc, meta_name)

    // NOTE(review): this skips the thumbnail when enable_tn is TRUE, which reads
    // inverted relative to the field name -- confirm against the code that sets it.
    if (ctx->enable_tn == TRUE) {
        FT_Done_Face(face);
        free(buf);
        return;
    }

    int pixel = 64;
    int num_chars = (int) strlen(font_name);

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
        CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
                         FT_Error_String(err))
        FT_Done_Face(face);
        free(buf);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);
    unsigned char *bitmap = calloc(dimensions.width * dimensions.height, 1);
    if (bitmap == NULL) {
        CTX_LOG_WARNINGF(doc->filepath, "(font.c) Could not allocate %ux%u thumbnail canvas",
                         dimensions.width, dimensions.height)
        FT_Done_Face(face);
        free(buf);
        return;
    }

    FT_Vector pen;
    pen.x = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            // Retry with the other letter case before giving up on this character
            c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
            err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
            if (err != 0) {
                CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
                                 FT_Error_String(err))
                continue;
            }
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        if (pen.x <= 0) {
            pen.x = ABS(glyph.advance_width - glyph.width);
        }
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

    APPEND_TN_META(doc, dimensions.width, dimensions.height)
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);

    FT_Done_Face(face);
    free(buf);
}
void cleanup_font() {
FT_Done_FreeType(ft_lib);
}

17
third-party/libscan/libscan/font/font.h vendored Normal file
View File

@ -0,0 +1,17 @@
#ifndef SCAN_FONT_H
#define SCAN_FONT_H
#include "../scan.h"
/* Settings and callbacks handed to parse_font(). */
typedef struct {
/* parse_font() compares this against TRUE to decide whether to stop after the font-name metadata. */
int enable_tn;
log_callback_t log;
logf_callback_t logf;
/* Receives the rendered thumbnail bytes, keyed by the document's path_md5. */
store_callback_t store;
} scan_font_ctx_t;
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc);
void cleanup_font();
#endif

119
third-party/libscan/libscan/json/json.c vendored Normal file
View File

@ -0,0 +1,119 @@
#include "json.h"
#include "cjson/cJSON.h"
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
/*
 * Depth-first walk of a cJSON tree that appends every string value, followed
 * by a space, to `tex`. Non-string scalars are ignored.
 *
 * Returns TRUE as soon as the text buffer reports TEXT_BUF_FULL, FALSE once
 * the whole subtree has been visited.
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsString(json)) {
        return text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL
               || text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
    }

    if (!cJSON_IsObject(json) && !cJSON_IsArray(json)) {
        return FALSE;
    }

    // Objects and arrays both keep their members in the child/next linked list
    for (cJSON *item = json->child; item != NULL; item = item->next) {
        if (json_extract_text(item, tex)) {
            return TRUE;
        }
    }
    return FALSE;
}
/*
 * Loads a whole JSON file in memory, parses it and stores the concatenated
 * string values as MetaContent.
 *
 * Returns SCAN_ERR_SKIP for files larger than JSON_MAX_FILE_SIZE,
 * SCAN_ERR_READ when the file cannot be read or the buffer cannot be grown,
 * SCAN_OK otherwise (including when the content is not valid JSON, in which
 * case the stored content is empty).
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // cJSON needs a NUL-terminated string: grow by one byte without losing
    // the original allocation if realloc() fails.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
#define JSON_BUF_SIZE (1024 * 1024 * 5)
/*
 * Streams a newline-delimited JSON file through a fixed JSON_BUF_SIZE window:
 * parse one value at the front of the window, extract its strings, shift the
 * unparsed remainder to the front and refill the freed tail from the file.
 *
 * Returns SCAN_ERR_READ when the window cannot be allocated, SCAN_OK
 * otherwise. Parsing stops at the first value that cannot be parsed or that
 * does not fit in the window.
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
    if (buf == NULL) {
        CTX_LOG_ERRORF("json.c", "Could not allocate read buffer [%s]", doc->filepath);
        return SCAN_ERR_READ;
    }
    *(buf + JSON_BUF_SIZE) = '\0';

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        cJSON *json;

        if (!eof) {
            // First pass fills the whole window; later passes refill what was consumed
            to_read = parse_end == buf ? JSON_BUF_SIZE : parse_end - buf;
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
                // Clear the part of the tail the read did not fill so that bytes
                // left over from a previous shift are not parsed a second time.
                size_t valid = ret < to_read ? ret : 0;
                memset(ptr + valid, 0, to_read - valid);
            }
        }

        json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        if (parse_end == buf) {
            // Nothing could be parsed: end of data or malformed input
            cJSON_Delete(json);
            break;
        }

        json_extract_text(json, &tex);
        cJSON_Delete(json);

        const size_t consumed = (size_t) (parse_end - buf);
        const size_t remaining = JSON_BUF_SIZE - consumed;

        memmove(buf, parse_end, remaining);
        ptr = buf + remaining;

        if (eof) {
            // No refill will happen: blank the stale copy left behind by memmove()
            memset(ptr, 0, consumed);
        }
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}

30
third-party/libscan/libscan/json/json.h vendored Normal file
View File

@ -0,0 +1,30 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
/* Settings and callbacks handed to parse_json() / parse_ndjson(). */
typedef struct {
/* Passed to text_buffer_create() as the capacity of the extracted text. */
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
/* Numeric MIME ids compared by is_json() and is_ndjson(). */
unsigned int json_mime;
unsigned int ndjson_mime;
} scan_json_ctx_t;
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Non-zero when `mime` is the id configured in ctx->json_mime. */
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
    return ctx->json_mime == mime;
}
/* Non-zero when `mime` is the id configured in ctx->ndjson_mime. */
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
    return ctx->ndjson_mime == mime;
}
#endif

62
third-party/libscan/libscan/macros.h vendored Normal file
View File

@ -0,0 +1,62 @@
/* Boolean helpers, only defined when the including code has not provided them. */
#ifndef FALSE
#define FALSE (0)
#define BOOL int
#endif
#ifndef TRUE
#define TRUE (!FALSE)
#endif
/* Function-like helpers: any previous definition is replaced, and each argument may be evaluated more than once. */
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
/* SHA-1: 20 raw digest bytes; 41 = 40 hexadecimal characters plus a terminating NUL. */
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
/*
 * Appends a string metadata line (key `keyname`, text `value`) to `doc`.
 * `value` is evaluated twice and must be a NUL-terminated string.
 * The allocation reserves room for the terminator written by strcpy().
 */
#define APPEND_STR_META(doc, keyname, value) \
    {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value) + 1); \
    meta_str->key = keyname; \
    strcpy(meta_str->str_val, value); \
    APPEND_META(doc, meta_str)}
/* Appends a numeric metadata line (key `keyname`, long_val `value`) to `doc`. The malloc() result is not checked. */
#define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_long->key = keyname; \
meta_long->long_val = value; \
APPEND_META(doc, meta_long)}
/*
 * Appends a MetaThumbnail line of the form "WWWW,HHHH" to `doc`.
 * Values are zero-padded to at least four digits. The buffer is sized for
 * two full-range ints (11 characters each), the comma and the terminator,
 * so dimensions above 9999 cannot overflow it.
 */
#define APPEND_TN_META(doc, width, height) \
    {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 11 + 1 + 11 + 1); \
    meta_str->key = MetaThumbnail; \
    snprintf(meta_str->str_val, 11 + 1 + 11 + 1, "%04d,%04d", width, height); \
    APPEND_META(doc, meta_str)}
/*
 * Links `meta` at the tail of doc's singly linked metadata list
 * (meta_head / meta_tail). Expands to several statements that are not
 * wrapped in a block, so it must not be used as the sole body of an
 * unbraced if/else/loop.
 */
#define APPEND_META(doc, meta) \
meta->next = NULL;\
if (doc->meta_head == NULL) {\
doc->meta_head = meta;\
doc->meta_tail = doc->meta_head;\
} else {\
doc->meta_tail->next = meta;\
doc->meta_tail = meta;\
}
/*
 * Runs `str` through a temporary text buffer and appends the result to `doc`
 * as a string metadata line with key `keyname`.
 * Declares the locals `tex` and `meta_tag` in the enclosing scope, so it can
 * be expanded at most once per block.
 * NOTE(review): the allocation relies on tex.dyn_buffer.cur already counting the terminator -- confirm in text_buffer_terminate_string().
 */
#define APPEND_UTF8_META(doc, keyname, str) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, str); \
text_buffer_terminate_string(&tex); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
APPEND_META(doc, meta_tag) \
text_buffer_destroy(&tex);

View File

@ -0,0 +1,749 @@
#include "media.h"
#include <ctype.h>
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
#define STORE_AS_IS ((void*)-1)
/*
 * Picks the name passed to the demuxer. When the document has no extension
 * and the MIME type is PNG or JPEG, a placeholder name carrying the right
 * extension is returned; otherwise `filepath` is returned unchanged.
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
    if (doc->ext > doc->base) {
        // Already has an extension
        return filepath;
    }
    if (!strcmp(mime_str, "image/png")) {
        return "file.png";
    }
    if (!strcmp(mime_str, "image/jpeg")) {
        return "file.jpg";
    }
    return filepath;
}
/*
 * Scales a decoded frame so that its longest side is at most `size` pixels.
 *
 * Returns:
 *   NULL         - empty frame, result too small (<= MIN_SIZE on a side), or
 *                  an allocation / scaler setup failure;
 *   STORE_AS_IS  - the frame already fits and is MJPEG or PNG, so the caller
 *                  can store the original packet;
 *   AVFrame *    - newly allocated frame; the caller releases the pixel
 *                  buffer with av_free(*frame->data) and then the frame
 *                  with av_frame_free().
 */
__always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {

    if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
        return NULL;
    }

    int dstW;
    int dstH;
    if (frame->width <= size && frame->height <= size) {
        if (decoder->codec_id == AV_CODEC_ID_MJPEG || decoder->codec_id == AV_CODEC_ID_PNG) {
            return STORE_AS_IS;
        }

        dstW = frame->width;
        dstH = frame->height;
    } else {
        // Keep the aspect ratio; the longest side becomes `size`
        double ratio = (double) frame->width / frame->height;
        if (frame->width > frame->height) {
            dstW = size;
            dstH = (int) (size / ratio);
        } else {
            dstW = (int) (size * ratio);
            dstH = size;
        }
    }

    if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
        return NULL;
    }

    AVFrame *scaled_frame = av_frame_alloc();
    if (scaled_frame == NULL) {
        return NULL;
    }

    struct SwsContext *sws_ctx = sws_getContext(
            decoder->width, decoder->height, decoder->pix_fmt,
            dstW, dstH, AV_PIX_FMT_YUVJ420P,
            SIST_SWS_ALGO, 0, 0, 0
    );
    if (sws_ctx == NULL) {
        // e.g. unsupported or unknown source pixel format
        av_frame_free(&scaled_frame);
        return NULL;
    }

    int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
    uint8_t *dst_buf = dst_buf_len > 0 ? (uint8_t *) av_malloc(dst_buf_len * 2) : NULL;
    if (dst_buf == NULL) {
        sws_freeContext(sws_ctx);
        av_frame_free(&scaled_frame);
        return NULL;
    }

    av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);

    sws_scale(sws_ctx,
              (const uint8_t *const *) frame->data, frame->linesize,
              0, decoder->height,
              scaled_frame->data, scaled_frame->linesize
    );

    scaled_frame->width = dstW;
    scaled_frame->height = dstH;
    scaled_frame->format = AV_PIX_FMT_YUV420P;

    sws_freeContext(sws_ctx);

    return scaled_frame;
}
/* A decoded frame together with the packet it was decoded from; built by read_frame(), released by frame_and_packet_free(). */
typedef struct {
AVPacket *packet;
AVFrame *frame;
} frame_and_packet_t;
/*
 * Releases a frame_and_packet_t and everything it owns. Accepts NULL.
 * The packet and frame come from libav allocators, so they are released only
 * through av_packet_free() / av_frame_free(), never through free().
 */
static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) {
    if (frame_and_packet == NULL) {
        return;
    }
    if (frame_and_packet->packet != NULL) {
        av_packet_free(&frame_and_packet->packet);
    }
    if (frame_and_packet->frame != NULL) {
        av_frame_free(&frame_and_packet->frame);
    }
    free(frame_and_packet);
}
/*
 * Decodes every packet of subtitle stream `stream_idx` and stores the
 * collected text as MetaContent on `doc`. For each subtitle rectangle only
 * the text following the first "\N" marker is kept.
 *
 * The demuxer is left at end of stream; the caller seeks back if needed.
 * When the subtitle decoder cannot be opened no packet is read and the
 * stored content is empty.
 */
__always_inline
static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) {

    text_buffer_t tex = text_buffer_create(-1);

    AVPacket packet;
    AVSubtitle subtitle;

    AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
    AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);

    int decoder_ready = decoder != NULL
                        && avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar) >= 0
                        && avcodec_open2(decoder, subtitle_codec, NULL) == 0;

    if (decoder_ready) {
        decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
    } else {
        CTX_LOG_WARNINGF(doc->filepath, "(media.c) Could not open subtitle decoder for stream [%d]", stream_idx)
    }

    while (decoder_ready) {
        int read_frame_ret = av_read_frame(pFormatCtx, &packet);

        if (read_frame_ret != 0) {
            break;
        }

        if (packet.stream_index != stream_idx) {
            av_packet_unref(&packet);
            continue;
        }

        // got_sub is only meaningful when the decode call succeeded
        int got_sub = 0;
        int decode_ret = avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet);

        if (decode_ret >= 0 && got_sub) {
            for (int i = 0; i < subtitle.num_rects; i++) {
                const char *text = subtitle.rects[i]->ass;

                if (text == NULL) {
                    continue;
                }

                char *idx = strstr(text, "\\N");
                if (idx != NULL && strlen(idx + 2) > 1) {
                    text_buffer_append_string0(&tex, idx + 2);
                    text_buffer_append_char(&tex, ' ');
                }
            }
            avsubtitle_free(&subtitle);
        }

        av_packet_unref(&packet);
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    text_buffer_destroy(&tex);
    avcodec_free_context(&decoder);
}
/*
 * Reads packets from stream `stream_idx` and feeds them to `decoder` until
 * the decoder stops answering -EAGAIN.
 *
 * Returns a newly allocated frame_and_packet_t holding the last packet read
 * and the frame slot, or NULL when reading a packet or sending it to the
 * decoder fails (everything allocated here is released in that case).
 * NOTE(review): a receive error other than -EAGAIN also ends the loop and
 * the result is returned as-is -- callers appear to rely on scale_frame()
 * rejecting frames whose pict_type is AV_PICTURE_TYPE_NONE; confirm.
 */
__always_inline
static frame_and_packet_t *
read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx,
document_t *doc) {
frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t));
result->packet = av_packet_alloc();
result->frame = av_frame_alloc();
av_init_packet(result->packet);
int receive_ret = -EAGAIN;
while (receive_ret == -EAGAIN) {
// Get video frame
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, result->packet);
if (read_frame_ret != 0) {
// End of file is expected and not logged; any other error is
if (read_frame_ret != AVERROR_EOF) {
CTX_LOG_WARNINGF(doc->filepath,
"(media.c) avcodec_read_frame() returned error code [%d] %s",
read_frame_ret, av_err2str(read_frame_ret)
)
}
frame_and_packet_free(result);
return NULL;
}
//Ignore audio/other frames
if (result->packet->stream_index != stream_idx) {
av_packet_unref(result->packet);
continue;
}
break;
}
// Feed it to decoder
int decode_ret = avcodec_send_packet(decoder, result->packet);
if (decode_ret != 0) {
CTX_LOG_ERRORF(doc->filepath,
"(media.c) avcodec_send_packet() returned error code [%d] %s",
decode_ret, av_err2str(decode_ret)
)
frame_and_packet_free(result);
return NULL;
}
receive_ret = avcodec_receive_frame(decoder, result->frame);
// The decoder needs more input: drop this packet's payload before reading the next one
if (receive_ret == -EAGAIN && result->packet != NULL) {
av_packet_unref(result->packet);
}
}
return result;
}
/*
 * Appends tag->value to `doc` under `key`, unless a metadata line with that
 * key is already present, in which case the duplicate is logged at debug
 * level and dropped.
 */
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {

    for (meta_line_t *existing = doc->meta_head; existing != NULL; existing = existing->next) {
        if (existing->key != key) {
            continue;
        }
        CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'",
                       key, existing->str_val, key, tag->value)
        return;
    }

    // Same buffer round-trip and allocation as the shared metadata macro
    APPEND_UTF8_META(doc, key, tag->value)
}
/* Appends the value of the local `tag` entry to the local `doc` under `keyname`. */
#define APPEND_TAG_META(keyname) \
    APPEND_UTF8_META(doc, keyname, tag->value)

/*
 * Copies `str` into the char array `dst` (sizeof(dst) must be the array
 * size), truncating if needed and always NUL-terminating, then lower-cases
 * it in place. Declares the local `ptr` in the enclosing scope.
 * tolower() is given an unsigned char value: passing a negative char is
 * undefined behaviour.
 */
#define STRCPY_TOLOWER(dst, str) \
    strncpy(dst, str, sizeof(dst) - 1); \
    dst[sizeof(dst) - 1] = '\0'; \
    char *ptr = dst; \
    for (; *ptr; ++ptr) *ptr = (char) tolower((unsigned char) *ptr);
/*
 * Copies the audio tags found in the container metadata (artist, genre,
 * title, album_artist, album, comment) to `doc`. Tag names are matched
 * case-insensitively.
 */
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {

    static const struct {
        const char *name;
        enum metakey key;
    } audio_tags[] = {
            {"artist",       MetaArtist},
            {"genre",        MetaGenre},
            {"title",        MetaTitle},
            {"album_artist", MetaAlbumArtist},
            {"album",        MetaAlbum},
            {"comment",      MetaContent},
    };
    const size_t tag_count = sizeof(audio_tags) / sizeof(audio_tags[0]);

    AVDictionaryEntry *tag = NULL;
    while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX)) != NULL) {
        char key[256];
        STRCPY_TOLOWER(key, tag->key)

        for (size_t i = 0; i < tag_count; i++) {
            if (strcmp(key, audio_tags[i].name) == 0) {
                APPEND_TAG_META(audio_tags[i].key)
                break;
            }
        }
    }
}
/*
 * Adds stream-level metadata to `doc`.
 *
 * Video input: duration in seconds (reset to 0 when above INT32_MAX),
 * bitrate, then the container's title / comment / artist tags, each only if
 * that key is not present yet.
 * Still image input: the EXIF entries attached to the decoded frame.
 */
__always_inline
static void
append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int is_video) {

    static const struct {
        const char *name;
        enum metakey key;
    } container_tags[] = {
            {"title",   MetaTitle},
            {"comment", MetaContent},
            {"artist",  MetaArtist},
    };
    static const struct {
        const char *name;
        enum metakey key;
    } exif_tags[] = {
            {"imagedescription", MetaContent},
            {"make",             MetaExifMake},
            {"model",            MetaExifModel},
            {"software",         MetaExifSoftware},
            {"fnumber",          MetaExifFNumber},
            {"focallength",      MetaExifFocalLength},
            {"usercomment",      MetaExifUserComment},
            {"isospeedratings",  MetaExifIsoSpeedRatings},
            {"exposuretime",     MetaExifExposureTime},
            {"datetime",         MetaExifDateTime},
            {"gpslatitude",      MetaExifGpsLatitudeDMS},
            {"gpslatituderef",   MetaExifGpsLatitudeRef},
            {"gpslongitude",     MetaExifGpsLongitudeDMS},
            {"gpslongituderef",  MetaExifGpsLongitudeRef},
    };

    AVDictionaryEntry *tag = NULL;

    if (is_video) {
        meta_line_t *duration_line = malloc(sizeof(meta_line_t));
        duration_line->key = MetaMediaDuration;
        duration_line->long_val = pFormatCtx->duration / AV_TIME_BASE;
        if (duration_line->long_val > INT32_MAX) {
            duration_line->long_val = 0;
        }
        APPEND_META(doc, duration_line)

        meta_line_t *bitrate_line = malloc(sizeof(meta_line_t));
        bitrate_line->key = MetaMediaBitrate;
        bitrate_line->long_val = pFormatCtx->bit_rate;
        APPEND_META(doc, bitrate_line)

        while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX)) != NULL) {
            char key[256];
            STRCPY_TOLOWER(key, tag->key)

            for (size_t i = 0; i < sizeof(container_tags) / sizeof(container_tags[0]); i++) {
                if (strcmp(key, container_tags[i].name) == 0) {
                    append_tag_meta_if_not_exists(ctx, doc, tag, container_tags[i].key);
                    break;
                }
            }
        }
        return;
    }

    // EXIF metadata
    while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX)) != NULL) {
        char key[256];
        STRCPY_TOLOWER(key, tag->key)

        // The artist is the only EXIF entry that is de-duplicated
        if (strcmp(key, "artist") == 0) {
            append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
            continue;
        }

        for (size_t i = 0; i < sizeof(exif_tags) / sizeof(exif_tags[0]); i++) {
            if (strcmp(key, exif_tags[i].name) == 0) {
                APPEND_TAG_META(exif_tags[i].key)
                break;
            }
        }
    }
}
/*
 * Extracts metadata and, when ctx->tn_size > 0, a thumbnail from an opened
 * format context. The context is closed before returning on every path, so
 * the caller must not use pFormatCtx afterwards.
 *
 * Steps: pick one audio / video / subtitle stream (streams are scanned from
 * the last index down), record codec names and dimensions, optionally read
 * subtitles, append audio tags, then decode one video frame, scale it and
 * store it either as the original packet or as a JPEG.
 */
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
int subtitle_stream = -1;
avformat_find_stream_info(pFormatCtx, NULL);
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
AVStream *stream = pFormatCtx->streams[i];
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
if (audio_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
}
audio_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
if (video_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaVideoCodec, desc->name)
}
meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth;
meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight;
meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h)
video_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
// Unlike audio/video, every subtitle stream overwrites the previous pick
subtitle_stream = i;
}
}
if (subtitle_stream != -1 && ctx->read_subtitles) {
read_subtitles(ctx, pFormatCtx, subtitle_stream, doc);
// Reset stream
if (video_stream != -1) {
av_seek_frame(pFormatCtx, video_stream, 0, 0);
}
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream];
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// Decoder
// NOTE(review): the results of avcodec_find_decoder() / avcodec_open2() are not checked here
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
//Seek
// Aim at 10% of the stream duration; the same seek is retried up to 21 times
if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
stream->duration * 0.10, 0);
if (seek_ret == 0) {
break;
}
}
}
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, video_stream, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (scaled_frame == STORE_AS_IS) {
// Small JPEG/PNG: the original compressed packet is the thumbnail
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
// NOTE(review): alloc_jpeg_encoder() can return NULL and the encode calls below are not checked
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
avcodec_free_context(&jpeg_encoder);
av_packet_unref(&jpeg_packet);
// The pixel buffer was allocated separately by scale_frame()
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
}
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
}
/*
 * Opens the media file at `filepath` directly through libavformat and hands
 * the resulting context to parse_media_format_ctx(), which closes it.
 * Failures are logged against doc->filepath.
 */
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {

    AVFormatContext *format_ctx = avformat_alloc_context();
    if (!format_ctx) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return;
    }

    const int open_status = avformat_open_input(&format_ctx, filepath, NULL, NULL);
    if (open_status >= 0) {
        parse_media_format_ctx(ctx, format_ctx, doc);
        return;
    }

    CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", open_status, av_err2str(open_status))
    avformat_close_input(&format_ctx);
    avformat_free_context(format_ctx);
}
/*
 * AVIO read callback backed by a vfile. `ptr` is the struct vfile given to
 * avio_alloc_context(). A zero-byte read is reported as AVERROR_EOF; any
 * other value from the vfile's read function is returned unchanged.
 */
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
    struct vfile *file = ptr;

    const int count = file->read(file, buf, buf_size);
    return count == 0 ? AVERROR_EOF : count;
}
/* An in-memory copy of a media file exposed to libav through a FILE stream opened with fmemopen(). */
typedef struct {
size_t size;
FILE *file;
void *buf;
} memfile_t;
int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
memfile_t *mem = ptr;
size_t ret = fread(buf, 1, buf_size, mem->file);
if (ret == 0 && feof(mem->file)) {
return AVERROR_EOF;
}
return (int) ret;
}
/*
 * AVIO seek callback backed by a memfile_t. A `whence` of 0x10000 asks for
 * the total size instead of seeking. Otherwise the stream is repositioned
 * with fseek() and the new offset is returned, or AVERROR_EOF if the seek
 * failed.
 */
long memfile_seek(void *ptr, long offset, int whence) {
    // Same numeric value as FFmpeg's AVSEEK_SIZE
    const int size_query = 0x10000;
    memfile_t *mem = ptr;

    if (whence == size_query) {
        return mem->size;
    }

    if (fseek(mem->file, offset, whence) != 0) {
        return AVERROR_EOF;
    }
    return ftell(mem->file);
}
/*
 * Loads the whole vfile in a freshly allocated buffer and opens a read-only
 * memory stream on it. When the vfile requests a checksum, the SHA-1 of the
 * buffer is computed and stored on the vfile.
 *
 * Returns 0 on success. Returns -1 when the allocation fails, the read is
 * short or the stream cannot be opened; in the last two cases mem->buf stays
 * allocated and is released by memfile_close().
 */
int memfile_open(vfile_t *f, memfile_t *mem) {
    mem->size = f->info.st_size;

    mem->buf = malloc(mem->size);
    if (!mem->buf) {
        return -1;
    }

    int bytes_read = f->read(f, mem->buf, mem->size);
    mem->file = fmemopen(mem->buf, mem->size, "rb");

    if (f->calculate_checksum) {
        SHA1_Init(&f->sha1_ctx);
        safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
        SHA1_Final(f->sha1_digest, &f->sha1_ctx);
        f->has_checksum = TRUE;
    }

    if (bytes_read != mem->size) {
        return -1;
    }
    return mem->file == NULL ? -1 : 0;
}
/*
 * Wraps a caller-owned buffer in a read-only memory stream.
 * The buffer is borrowed, not copied. The length is stored as a size_t
 * without narrowing, so buffers larger than INT_MAX keep their real size.
 *
 * Returns 0 on success, -1 when fmemopen() fails (mem->file is then NULL).
 */
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
    mem->size = buf_len;
    mem->buf = buf;
    mem->file = fmemopen(mem->buf, mem->size, "rb");
    return mem->file != NULL ? 0 : -1;
}
/*
 * Releases a memfile_t: closes the stream if one was opened, then frees the
 * buffer. Safe on a zero-initialized or partially opened memfile (for
 * example buffer allocated but fmemopen() failed) and safe to call twice,
 * because both fields are reset to NULL.
 */
void memfile_close(memfile_t *mem) {
    // Close the stream first: it still refers to the buffer
    if (mem->file != NULL) {
        fclose(mem->file);
        mem->file = NULL;
    }
    if (mem->buf != NULL) {
        free(mem->buf);
        mem->buf = NULL;
    }
}
/*
 * Parses a media file that is only reachable through a vfile.
 *
 * Files no larger than ctx->max_media_buffer are loaded in memory, which
 * gives libav a seekable stream; otherwise, or when loading fails, the vfile
 * is read sequentially without seek support.
 * NOTE(review): when memfile_open() fails after consuming part of the vfile,
 * the fallback presumably continues from the current read position -- confirm.
 */
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return;
}
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
AVIOContext *io_ctx = NULL;
memfile_t memfile = {0, 0, 0};
// Supplies a name with an image extension when the document has none
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
if (f->info.st_size <= ctx->max_media_buffer) {
int ret = memfile_open(f, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
}
}
if (io_ctx == NULL) {
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
}
pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
// NOTE(review): -5 looks like AVERROR(EIO), deliberately not logged -- confirm
if (res != -5) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
}
av_free(io_ctx->buffer);
memfile_close(&memfile);
avio_context_free(&io_ctx);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// parse_media_format_ctx() closes pFormatCtx; only the I/O layer is released here
parse_media_format_ctx(ctx, pFormatCtx, doc);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
memfile_close(&memfile);
}
/*
 * Entry point of the media parser: files that exist on the filesystem are
 * opened by path, everything else goes through the vfile read callbacks.
 */
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
    if (!f->is_fs_file) {
        parse_media_vfile(ctx, f, doc, mime_str);
        return;
    }
    parse_media_filename(ctx, f->filepath, doc);
}
/* One-time setup for the media parser: silences libav's own logging. */
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
/*
 * Decodes the image held in `buf` and stores a thumbnail of it for `doc`.
 *
 * ctx     - media settings (tn_size, tn_qscale) and the store callback.
 * buf     - caller-owned encoded image; it is borrowed, never freed here.
 * buf_len - size of `buf` in bytes.
 * url     - name handed to avformat_open_input().
 *
 * Returns TRUE when a thumbnail was stored, FALSE on any failure (no input
 * stream, decode error, image too small, encoder failure, allocation
 * failure). All libav resources are released on every path.
 */
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
    memfile_t memfile = {0, 0, 0};
    AVIOContext *io_ctx = NULL;
    AVCodecContext *decoder = NULL;
    AVStream *stream = NULL;
    const AVCodec *video_codec = NULL;
    AVFrame *scaled_frame = NULL;
    frame_and_packet_t *frame_and_packet = NULL;
    int stored = FALSE;

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return FALSE;
    }

    if (memfile_open_buf(buf, buf_len, &memfile) != 0) {
        // fmemopen() failed: memfile.file is NULL, so there is no stream to close
        avformat_free_context(pFormatCtx);
        return FALSE;
    }
    CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%zuB)", buf_len)

    unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer != NULL) {
        io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
    }
    if (io_ctx == NULL) {
        av_free(buffer);
        avformat_free_context(pFormatCtx);
        fclose(memfile.file);
        return FALSE;
    }

    pFormatCtx->pb = io_ctx;

    // On failure avformat_open_input() frees the context and sets it to NULL
    int res = avformat_open_input(&pFormatCtx, url, NULL, NULL);
    if (res != 0 || pFormatCtx->nb_streams == 0) {
        goto cleanup;
    }

    stream = pFormatCtx->streams[0];

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    avcodec_parameters_to_context(decoder, stream->codecpar);
    avcodec_open2(decoder, video_codec, NULL);

    frame_and_packet = read_frame(ctx, pFormatCtx, decoder, 0, doc);
    if (frame_and_packet == NULL) {
        goto cleanup;
    }

    // Scale frame
    scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    if (scaled_frame == STORE_AS_IS) {
        // Small JPEG/PNG: the original compressed packet is the thumbnail
        APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
                   frame_and_packet->packet->size);
        stored = TRUE;
    } else {
        // Encode frame to jpeg
        AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
                                                          ctx->tn_qscale);
        if (jpeg_encoder != NULL) {
            AVPacket jpeg_packet;
            av_init_packet(&jpeg_packet);
            // av_init_packet() leaves data/size untouched
            jpeg_packet.data = NULL;
            jpeg_packet.size = 0;

            if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0
                && avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
                // Save thumbnail
                APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
                ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data,
                           jpeg_packet.size);
                stored = TRUE;
            }

            av_packet_unref(&jpeg_packet);
            avcodec_free_context(&jpeg_encoder);
        }

        // The pixel buffer was allocated separately by scale_frame()
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }

cleanup:
    if (frame_and_packet != NULL) {
        frame_and_packet_free(frame_and_packet);
    }
    avcodec_free_context(&decoder);
    avformat_close_input(&pFormatCtx);

    // With a custom pb the I/O context and its buffer are ours to release
    av_free(io_ctx->buffer);
    avio_context_free(&io_ctx);
    fclose(memfile.file);

    return stored;
}

View File

@ -0,0 +1,52 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "../scan.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavcodec/avcodec.h"
#include "libavutil/imgutils.h"
/* Settings and callbacks handed to the media parser. */
typedef struct {
log_callback_t log;
logf_callback_t logf;
/* Receives thumbnail bytes, keyed by the document's path_md5. */
store_callback_t store;
/* Maximum thumbnail side in pixels passed to the frame scaler. */
int tn_size;
/* Passed to alloc_jpeg_encoder() as its qscale argument. */
float tn_qscale;
/* Compared against the file size to decide whether a vfile is loaded fully in memory. */
long max_media_buffer;
/* Non-zero to extract subtitle text. */
int read_subtitles;
} scan_media_ctx_t;
/*
 * Creates and opens an MJPEG encoder for w x h YUVJ420P frames.
 * `qscale` is stored in the context's i_quant_factor.
 *
 * Returns the opened context, to be released with avcodec_free_context(),
 * or NULL when the context cannot be allocated or opened. Nothing is leaked
 * on failure.
 */
__always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {

    const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
    AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
    if (jpeg == NULL) {
        return NULL;
    }

    jpeg->width = w;
    jpeg->height = h;
    jpeg->time_base.den = 1000000;
    jpeg->time_base.num = 1;
    jpeg->i_quant_factor = qscale;
    jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;

    int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
    if (ret != 0) {
        avcodec_free_context(&jpeg);
        return NULL;
    }

    return jpeg;
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif

View File

@ -0,0 +1,79 @@
#include "scan_mobi.h"
#include <mobi.h>
#include <errno.h>
#include "stdlib.h"
/*
 * Parses a MOBI ebook: records the author and title when present and stores
 * the book text, with markup stripped by the text buffer, as MetaContent.
 *
 * The file is read fully in memory and handed to libmobi through a memory
 * stream. Every failure path logs against f->filepath and releases the
 * MOBIData handle and all buffers.
 */
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

    MOBIData *m = mobi_init();
    if (m == NULL) {
        CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
        return;
    }

    size_t buf_len;
    char* buf = read_all(f, &buf_len);
    if (buf == NULL) {
        mobi_free(m);
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        return;
    }

    MOBI_RET mobi_ret = mobi_load_file(m, file);
    fclose(file);
    if (mobi_ret != MOBI_SUCCESS) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "mobi_load_file() returned error code [%d]", mobi_ret)
        return;
    }

    char *author = mobi_meta_get_author(m);
    if (author != NULL) {
        APPEND_STR_META(doc, MetaAuthor, author)
        free(author);
    }
    char *title = mobi_meta_get_title(m);
    if (title != NULL) {
        APPEND_STR_META(doc, MetaTitle, title)
        free(title);
    }

    const size_t maxlen = mobi_get_text_maxsize(m);
    if (maxlen == MOBI_NOTSET) {
        mobi_free(m);
        free(buf);
        CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
        return;
    }

    // One extra byte for the terminator
    char *content_str = malloc(maxlen + 1);
    if (content_str == NULL) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERROR(f->filepath, "malloc() failed")
        return;
    }

    size_t length = maxlen;
    mobi_ret = mobi_get_rawml(m, content_str, &length);
    if (mobi_ret != MOBI_SUCCESS) {
        mobi_free(m);
        free(content_str);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_markup(&tex, content_str);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)

    free(content_str);
    free(buf);
    text_buffer_destroy(&tex);
    mobi_free(m);
}

View File

@ -0,0 +1,14 @@
#ifndef SCAN_SCAN_MOBI_H
#define SCAN_SCAN_MOBI_H
#include "../scan.h"
// Settings and callbacks for the MOBI parser.
typedef struct {
// Passed to text_buffer_create() as the maximum size of the extracted text.
long content_size;
// Plain-string logger; invoked through the CTX_LOG_* macros.
log_callback_t log;
// printf-style logger; invoked through the CTX_LOG_*F macros.
logf_callback_t logf;
} scan_mobi_ctx_t;
// Read the MOBI file `f` and append author, title and content metadata to `doc`.
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -0,0 +1,147 @@
#include "msdoc.h"
#include <errno.h>
#include <sys/mman.h>
#include "../../third-party/antiword/src/antiword.h"
#include "../ebook/ebook.h"
/**
 * Convert a legacy Word document to UTF-8 text with antiword and attach
 * author, title and content metadata to `doc`.
 *
 * Ownership: this function always frees `buf`. It never closes `file_in`;
 * the caller stays responsible for that stream on every path.
 *
 * @param ctx     parser context (content size limit and callbacks)
 * @param doc     document receiving the metadata
 * @param file_in readable stream positioned over the Word document
 * @param buf     heap buffer backing the document; freed here
 * @param buf_len size of the document in bytes
 */
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_text;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_utf_8;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file_in);

    size_t out_len;
    char *out_buf;
    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERROR(doc->filepath, "open_memstream() failed")
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release everything this function owns; file_in belongs to the caller,
        // so closing it here would lead to a double fclose().
        fclose(file_out);
        free(out_buf);
        free(buf);
        return;
    }

    iInitDocument(file_in, (int) buf_len);
    const char *author = szGetAuthor();
    if (author != NULL) {
        APPEND_UTF8_META(doc, MetaAuthor, author)
    }

    const char *title = szGetTitle();
    if (title != NULL) {
        APPEND_UTF8_META(doc, MetaTitle, title)
    }
    vFreeDocument();

    bWordDecryptor(file_in, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the memstream finalises out_buf / out_len.
    fclose(file_out);

    if (buf_len > 0) {
        text_buffer_t tex = text_buffer_create(ctx->content_size);
        text_buffer_append_string(&tex, out_buf, out_len);
        text_buffer_terminate_string(&tex);

        // dyn_buffer.cur includes the terminating NUL after terminate_string().
        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        meta_content->key = MetaContent;
        memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
        APPEND_META(doc, meta_content)

        text_buffer_destroy(&tex);
    }

    free(buf);
    free(out_buf);
}
/**
 * Render a legacy Word document to PDF with antiword and hand the result to
 * the e-book parser (parse_ebook_mem) for processing.
 *
 * Ownership: this function always frees `buf`; it never closes `file`.
 *
 * @param ctx     parser context; its settings are forwarded to the e-book parser
 * @param doc     document receiving the metadata
 * @param file    readable stream positioned over the Word document
 * @param buf     heap buffer owned by this call; freed here
 * @param buf_len size of the document in bytes
 */
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {

    scan_ebook_ctx_t ebook_ctx = {
            .content_size = ctx->content_size,
            .tn_size = ctx->tn_size,
            .log = ctx->log,
            .logf = ctx->logf,
            .store = ctx->store,
    };

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_pdf;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_latin_1;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file);

    size_t out_len;
    char *out_buf;
    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERROR(doc->filepath, "open_memstream() failed")
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release the output stream and both buffers instead of leaking them.
        fclose(file_out);
        free(out_buf);
        free(buf);
        return;
    }

    bWordDecryptor(file, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the memstream finalises out_buf / out_len.
    fclose(file_out);

    parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);

    free(buf);
    free(out_buf);
}
/**
 * Entry point for legacy Word documents: reads the file into memory, runs the
 * PDF pass when thumbnails are enabled (tn_size > 0), then the text pass.
 *
 * @param ctx parser context
 * @param f   virtual file to read
 * @param doc document receiving the metadata
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        return;
    }

    if (ctx->tn_size > 0) {
        // parse_msdoc_pdf() frees the buffer it is given, so it gets its own copy.
        char *buf_pdf = malloc(buf_len);
        if (buf_pdf != NULL) {
            memcpy(buf_pdf, buf, buf_len);
            parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
        } else {
            // Skip the PDF pass but still extract the text below.
            CTX_LOG_ERROR(f->filepath, "malloc() failed")
        }
    }

    // parse_msdoc_text() takes ownership of buf and frees it.
    parse_msdoc_text(ctx, doc, file, buf, buf_len);
    fclose(file);
}

View File

@ -0,0 +1,24 @@
#ifndef SCAN_SCAN_MSDOC_H
#define SCAN_SCAN_MSDOC_H
#include "../scan.h"
// Settings and callbacks for the legacy Word (.doc) parser.
typedef struct {
// Passed to text_buffer_create() as the maximum size of the extracted text.
long content_size;
// When > 0, parse_msdoc() also runs the PDF conversion pass; the value is forwarded to the e-book parser.
int tn_size;
// Plain-string logger; invoked through the CTX_LOG_* macros.
log_callback_t log;
// printf-style logger; invoked through the CTX_LOG_*F macros.
logf_callback_t logf;
// Storage callback forwarded to the e-book parser by parse_msdoc_pdf().
store_callback_t store;
// MIME identifier that is_msdoc() compares against.
unsigned int msdoc_mime;
} scan_msdoc_ctx_t;
/**
 * Tell whether `mime` is the MIME identifier configured for Word documents.
 *
 * @return non-zero when `mime` equals ctx->msdoc_mime, zero otherwise
 */
__always_inline
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected_mime = ctx->msdoc_mime;
    return expected_mime == mime;
}
// Read the Word document `f` and append its metadata (and, when tn_size > 0, the PDF-derived data) to `doc`.
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
// Text-only pass over an already opened document stream. Frees `buf` before returning.
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

View File

@ -0,0 +1,260 @@
#include "ooxml.h"
#include <archive.h>
#include <archive_entry.h>
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
#define _X(str) ((const xmlChar*)str)

/**
 * Decide whether an archive member carries document text worth extracting.
 *
 * @param part path of the member inside the OOXML archive (may be NULL)
 * @return TRUE when the path starts with one of the known Word, PowerPoint
 *         or Excel content prefixes, FALSE otherwise (including for NULL)
 */
__always_inline
static int should_read_part(const char *part) {
    static const char *const content_prefixes[] = {
            // Word
            "word/document.xml",
            "word/footnotes.xml",
            "word/endnotes.xml",
            "word/footer",
            "word/header",
            // PowerPoint
            "ppt/slides/slide",
            "ppt/notesSlides/slide",
            // Excel
            "xl/worksheets/sheet",
            "xl/sharedStrings.xml",
            "xl/workbook.xml",
    };
    const size_t prefix_count = sizeof(content_prefixes) / sizeof(content_prefixes[0]);

    if (part == NULL) {
        return FALSE;
    }

    for (size_t i = 0; i < prefix_count; i++) {
        const char *prefix = content_prefixes[i];
        if (strncmp(prefix, part, strlen(prefix)) == 0) {
            return TRUE;
        }
    }

    return FALSE;
}
/**
 * Walk `node`, its following siblings and all their descendants, appending
 * the text of every element named "t" to `buf`, each followed by a space.
 *
 * @return 0 when the walk completes, TEXT_BUF_FULL as soon as the buffer
 *         reports it is full, or -1 when libxml2's last error is fatal.
 *         Note that -1 is numerically the same as TEXT_BUF_FULL, so callers
 *         comparing against TEXT_BUF_FULL cannot tell the two apart.
 */
int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
    xmlErrorPtr last_error = xmlGetLastError();
    if (last_error != NULL && last_error->level == XML_ERR_FATAL) {
        CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", last_error->message)
        return -1;
    }

    xmlNode *cur = node;
    while (cur != NULL) {
        const xmlChar *tag = cur->name;
        const int is_text_element = tag != NULL && tag[0] == 't' && tag[1] == '\0';

        if (is_text_element) {
            xmlChar *text = xmlNodeListGetString(xml, cur->xmlChildrenNode, 1);
            if (text != NULL) {
                int append_status = text_buffer_append_string0(buf, (char *) text);
                text_buffer_append_char(buf, ' ');
                xmlFree(text);

                if (append_status == TEXT_BUF_FULL) {
                    return append_status;
                }
            }
        }

        // Depth-first: descend before moving on to the next sibling.
        if (extract_text(ctx, xml, cur->children, buf) == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }

        cur = cur->next;
    }

    return 0;
}
/**
 * libxml2 read callback: pulls up to `len` bytes of the current archive
 * entry into `buffer`.
 *
 * @param context the `struct archive *` being read
 * @return the result of archive_read_data(), narrowed to int
 */
int xml_io_read(void *context, char *buffer, int len) {
    struct archive *source = (struct archive *) context;
    int bytes_read = (int) archive_read_data(source, buffer, len);
    return bytes_read;
}
// libxml2 close callback paired with xml_io_read(). Intentionally does nothing:
// the archive handle is closed by parse_ooxml() once all entries are processed.
// Always returns 0.
int xml_io_close(UNUSED(void *context)) {
//noop
return 0;
}
#define READ_PART_ERR (-2)

/**
 * Parse the current archive entry as XML and append its text to `buf`.
 *
 * @return the value of extract_text() on success, or READ_PART_ERR when the
 *         entry cannot be parsed or has no root element
 */
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
    const int parse_flags = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_flags);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return READ_PART_ERR;
    }

    int status;
    xmlNode *root = xmlDocGetRootElement(xml);
    if (root != NULL) {
        status = extract_text(ctx, xml, root, buf);
    } else {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        status = READ_PART_ERR;
    }

    xmlFreeDoc(xml);
    return status;
}
/**
 * Parse docProps/app.xml from the current archive entry and record the page
 * count (the "Pages" element under a "Properties" root) as MetaPages.
 *
 * @return 0 on success, -1 when the XML cannot be parsed or is empty
 */
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
    const int parse_flags = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_flags);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

    if (xmlStrEqual(root->name, _X("Properties"))) {
        xmlNode *child = root->children;
        while (child != NULL) {
            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
            if (text != NULL) {
                if (xmlStrEqual(child->name, _X("Pages"))) {
                    APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
                }
                xmlFree(text);
            }
            child = child->next;
        }
    }

    xmlFreeDoc(xml);
    return 0;
}
/**
 * Parse docProps/core.xml from the current archive entry and record the
 * "title", "creator" and "lastModifiedBy" children of a "coreProperties"
 * root as MetaTitle, MetaAuthor and MetaModifiedBy.
 *
 * @return 0 on success, -1 when the XML cannot be parsed or is empty
 */
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
    const int parse_flags = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_flags);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

    if (xmlStrEqual(root->name, _X("coreProperties"))) {
        xmlNode *child = root->children;
        while (child != NULL) {
            xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
            if (text != NULL) {
                if (xmlStrEqual(child->name, _X("title"))) {
                    APPEND_STR_META(doc, MetaTitle, (char *) text)
                } else if (xmlStrEqual(child->name, _X("creator"))) {
                    APPEND_STR_META(doc, MetaAuthor, (char *) text)
                } else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) {
                    APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
                }
                xmlFree(text);
            }
            child = child->next;
        }
    }

    xmlFreeDoc(xml);
    return 0;
}
#define MAX_TN_SIZE (1024 * 1024 * 15)

/**
 * Copy the embedded thumbnail from the current archive entry into the store,
 * keyed by the document's path MD5.
 *
 * Entries with an unknown or non-positive size, or larger than MAX_TN_SIZE,
 * are ignored. Nothing is stored when the entry cannot be read completely.
 */
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
    // Keep the size signed until it has been validated.
    long long declared_size = (long long) archive_entry_size(entry);
    if (declared_size <= 0 || declared_size > MAX_TN_SIZE) {
        return;
    }
    size_t entry_size = (size_t) declared_size;

    char *buf = malloc(entry_size);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        return;
    }

    long long read_len = (long long) archive_read_data(a, buf, entry_size);
    if (read_len != declared_size) {
        // A short or failed read would store uninitialised bytes as a thumbnail.
        CTX_LOG_ERROR(doc->filepath, "Could not read thumbnail")
        free(buf);
        return;
    }

    APPEND_TN_META(doc, 1, 1) // Size unknown
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, entry_size);
    free(buf);
}
/**
 * Parse an Office Open XML document (docx/pptx/xlsx).
 *
 * The file is read into memory and opened as a zip archive. Text parts are
 * appended to a size-limited buffer, docProps/app.xml and docProps/core.xml
 * provide metadata, and docProps/thumbnail.jpeg is stored as the thumbnail.
 *
 * @param ctx parser context (content size limit and callbacks)
 * @param f   virtual file to read
 * @param doc document receiving the metadata
 */
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    struct archive *a = archive_read_new();
    archive_read_support_format_zip(a);

    int ret = archive_read_open_memory(a, buf, buf_len);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
        archive_read_free(a);
        free(buf);
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    struct archive_entry *entry;
    int buffer_full = FALSE;
    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (!S_ISREG(archive_entry_stat(entry)->st_mode)) {
            continue;
        }

        const char *path = archive_entry_pathname(entry);
        if (path == NULL) {
            // No usable name: the strcmp() calls below must not see NULL.
            continue;
        }

        if (!buffer_full && should_read_part(path) && ctx->content_size > 0) {
            ret = read_part(ctx, a, &tex, doc);
            if (ret == READ_PART_ERR) {
                break;
            } else if (ret == TEXT_BUF_FULL) {
                // Keep iterating so metadata and thumbnail entries are still seen.
                buffer_full = TRUE;
            }
        } else if (strcmp(path, "docProps/app.xml") == 0) {
            if (read_doc_props_app(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/core.xml") == 0) {
            if (read_doc_props(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
            read_thumbnail(ctx, doc, a, entry);
        }
    }

    if (tex.dyn_buffer.cur > 0) {
        text_buffer_terminate_string(&tex);

        // dyn_buffer.cur includes the terminating NUL after terminate_string().
        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        if (meta != NULL) {
            meta->key = MetaContent;
            strcpy(meta->str_val, tex.dyn_buffer.buf);
            APPEND_META(doc, meta)
        } else {
            CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        }
    }

    archive_read_close(a);
    archive_read_free(a);
    text_buffer_destroy(&tex);
    free(buf);
}

View File

@ -0,0 +1,16 @@
#ifndef SCAN_OOXML_H
#define SCAN_OOXML_H
#include <stdlib.h>
#include "../scan.h"
// Settings and callbacks for the Office Open XML parser.
typedef struct {
// Maximum size of the extracted text; text parts are skipped entirely when this is <= 0.
long content_size;
// Plain-string logger; invoked through the CTX_LOG_* macros.
log_callback_t log;
// printf-style logger; invoked through the CTX_LOG_*F macros.
logf_callback_t logf;
// Receives the embedded thumbnail bytes, keyed by the document's path MD5.
store_callback_t store;
} scan_ooxml_ctx_t;
// Read the OOXML archive `f` and append text, properties and thumbnail metadata to `doc`.
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

218
third-party/libscan/libscan/raw/raw.c vendored Normal file
View File

@ -0,0 +1,218 @@
#include "raw.h"
#include <libraw/libraw.h>
#include "../media/media.h"
#include <unistd.h>
#define MIN_SIZE 32

/**
 * Store a thumbnail from a JPEG preview that libraw extracted from the raw file.
 *
 * The raw context is reinterpreted as a media context: the fields of
 * scan_raw_ctx_t match the leading fields of scan_media_ctx_t in order and type.
 * NOTE(review): store_image_thumbnail() must not read the media-only trailing
 * fields through this pointer -- confirm in media.c.
 *
 * @return the value returned by store_image_thumbnail()
 */
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
    scan_media_ctx_t *media_ctx = (scan_media_ctx_t *) ctx;
    int stored = store_image_thumbnail(media_ctx, img->data, img->data_size, doc, "x.jpeg");
    return stored;
}
/**
 * Scale an RGB24 bitmap down to the thumbnail size, encode it as JPEG and
 * hand it to the store callback keyed by the document's path MD5.
 *
 * The image is scaled so that neither side exceeds ctx->tn_size while keeping
 * the aspect ratio; images already within bounds are not resized.
 *
 * @return TRUE when a thumbnail was stored; FALSE when the target size is
 *         MIN_SIZE or smaller on either side, or when scaling or encoding fails
 */
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {

    int dstW;
    int dstH;

    if (img->width <= ctx->tn_size && img->height <= ctx->tn_size) {
        dstW = img->width;
        dstH = img->height;
    } else {
        double ratio = (double) img->width / img->height;
        if (img->width > img->height) {
            dstW = ctx->tn_size;
            dstH = (int) (ctx->tn_size / ratio);
        } else {
            dstW = (int) (ctx->tn_size * ratio);
            dstH = ctx->tn_size;
        }
    }

    if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
        return FALSE;
    }

    struct SwsContext *sws_ctx = sws_getContext(
            img->width, img->height, AV_PIX_FMT_RGB24,
            dstW, dstH, AV_PIX_FMT_YUVJ420P,
            SIST_SWS_ALGO, 0, 0, 0
    );
    if (sws_ctx == NULL) {
        return FALSE;
    }

    AVFrame *scaled_frame = av_frame_alloc();
    if (scaled_frame == NULL) {
        sws_freeContext(sws_ctx);
        return FALSE;
    }

    int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
    uint8_t *dst_buf = dst_buf_len > 0 ? (uint8_t *) av_malloc(dst_buf_len) : NULL;
    if (dst_buf == NULL) {
        av_frame_free(&scaled_frame);
        sws_freeContext(sws_ctx);
        return FALSE;
    }

    av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);

    // Packed RGB24: a single plane with 3 bytes per pixel.
    const uint8_t *in_data[1] = {img->data};
    int in_line_size[1] = {3 * img->width};

    sws_scale(sws_ctx,
              in_data, in_line_size,
              0, img->height,
              scaled_frame->data, scaled_frame->linesize
    );

    scaled_frame->width = dstW;
    scaled_frame->height = dstH;
    scaled_frame->format = AV_PIX_FMT_YUV420P;

    sws_freeContext(sws_ctx);

    int stored = FALSE;
    AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f);
    if (jpeg_encoder != NULL) {
        AVPacket jpeg_packet;
        av_init_packet(&jpeg_packet);
        // av_init_packet() leaves data/size untouched; start from a blank packet.
        jpeg_packet.data = NULL;
        jpeg_packet.size = 0;

        if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0
            && avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
            APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
            ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
            stored = TRUE;
        }

        av_packet_unref(&jpeg_packet);
        avcodec_free_context(&jpeg_encoder);
    }

    // The frame only borrows dst_buf (set up by av_image_fill_arrays), so free it explicitly.
    av_free(dst_buf);
    av_frame_free(&scaled_frame);

    return stored;
}
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)

/**
 * Parse a camera raw file with libraw.
 *
 * Appends EXIF-style metadata (model, make, software, dimensions, ISO,
 * description, artist, timestamp, focal length, aperture, exposure time, GPS)
 * to `doc`. When ctx->tn_size > 0 a thumbnail is generated: first from the
 * embedded preview, and if that fails, from the fully decoded image.
 *
 * @param ctx parser context (thumbnail size and callbacks)
 * @param f   virtual file to read
 * @param doc document receiving the metadata
 */
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
    libraw_data_t *libraw_lib = libraw_init(0);

    if (!libraw_lib) {
        CTX_LOG_ERROR("raw.c", "Cannot create libraw handle")
        return;
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        // The libraw handle must be released on this path as well.
        libraw_close(libraw_lib);
        return;
    }

    int ret = libraw_open_buffer(libraw_lib, buf, buf_len);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not open raw file")
        free(buf);
        libraw_close(libraw_lib);
        return;
    }

    if (*libraw_lib->idata.model != '\0') {
        APPEND_STR_META(doc, MetaExifModel, libraw_lib->idata.model)
    }
    if (*libraw_lib->idata.make != '\0') {
        APPEND_STR_META(doc, MetaExifMake, libraw_lib->idata.make)
    }
    if (*libraw_lib->idata.software != '\0') {
        APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
    }
    APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
    APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)

    char tmp[1024];
    snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
    APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)

    if (*libraw_lib->other.desc != '\0') {
        APPEND_STR_META(doc, MetaContent, libraw_lib->other.desc)
    }
    if (*libraw_lib->other.artist != '\0') {
        APPEND_STR_META(doc, MetaArtist, libraw_lib->other.artist)
    }

    // localtime() returns NULL for timestamps it cannot represent.
    struct tm *local_tm = localtime(&libraw_lib->other.timestamp);
    if (local_tm != NULL) {
        strftime(tmp, sizeof(tmp), "%Y:%m:%d %H:%M:%S", local_tm);
        APPEND_STR_META(doc, MetaExifDateTime, tmp)
    }

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.focal_len);
    APPEND_STR_META(doc, MetaExifFocalLength, tmp)

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
    APPEND_STR_META(doc, MetaExifFNumber, tmp)

    // A zero shutter value would divide by zero and convert infinity to int.
    if (libraw_lib->other.shutter > 0) {
        int denominator = (int) roundf(1 / libraw_lib->other.shutter);
        snprintf(tmp, sizeof(tmp), "1/%d", denominator);
        APPEND_STR_META(doc, MetaExifExposureTime, tmp)
    }

    // Degrees/minutes/seconds to signed decimal degrees; S and W are negative.
    libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
    double gps_longitude_dec =
            (gps.longtitude[0] + gps.longtitude[1] / 60 + gps.longtitude[2] / 3600) * DMS_REF(gps.longref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
    if (gps_longitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
    }

    double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
    if (gps_latitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
    }

    APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")

    if (ctx->tn_size <= 0) {
        free(buf);
        libraw_close(libraw_lib);
        return;
    }

    // First attempt: the preview embedded in the raw file.
    libraw_unpack_thumb(libraw_lib);

    int errc = 0;
    libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
    if (errc != 0) {
        free(buf);
        libraw_dcraw_clear_mem(thumb);
        libraw_close(libraw_lib);
        return;
    }

    int tn_ok = 0;
    if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
        tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
    } else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
        // TODO: technically this should work but is currently untested
        tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
    }

    libraw_dcraw_clear_mem(thumb);

    if (tn_ok == TRUE) {
        free(buf);
        libraw_close(libraw_lib);
        return;
    }

    // Fallback: decode the full image and scale it down.
    ret = libraw_unpack(libraw_lib);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not unpack raw file")
        free(buf);
        libraw_close(libraw_lib);
        return;
    }

    libraw_dcraw_process(libraw_lib);

    errc = 0;
    libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
    if (errc != 0) {
        free(buf);
        libraw_dcraw_clear_mem(img);
        libraw_close(libraw_lib);
        return;
    }

    store_thumbnail_rgb24(ctx, img, doc);

    libraw_dcraw_clear_mem(img);
    libraw_close(libraw_lib);

    free(buf);
}

17
third-party/libscan/libscan/raw/raw.h vendored Normal file
View File

@ -0,0 +1,17 @@
#ifndef SIST2_RAW_H
#define SIST2_RAW_H
#include "../scan.h"
// Settings and callbacks for the camera-raw parser.
// raw.c casts a pointer to this struct to scan_media_ctx_t *, so these fields
// must keep the same order and types as the leading fields of scan_media_ctx_t.
typedef struct {
// Plain-string logger; invoked through the CTX_LOG_* macros.
log_callback_t log;
// printf-style logger; invoked through the CTX_LOG_*F macros.
logf_callback_t logf;
// Receives the encoded thumbnail, keyed by the document's path MD5.
store_callback_t store;
// Maximum thumbnail edge length in pixels; thumbnails are skipped when <= 0.
int tn_size;
// Thumbnail quality scale. NOTE(review): not read in raw.c itself -- presumably used via the media-context cast; confirm.
float tn_qscale;
} scan_raw_ctx_t;
// Read the raw image `f`, append its metadata to `doc` and store a thumbnail when enabled.
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif //SIST2_RAW_H

170
third-party/libscan/libscan/scan.h vendored Normal file
View File

@ -0,0 +1,170 @@
#ifndef SCAN_SCAN_H
#define SCAN_SCAN_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h"
// Scaling algorithm flag passed to sws_getContext() when resizing images.
#define SIST_SWS_ALGO SWS_LANCZOS
// Marks a parameter as intentionally unused, e.g. `int f(UNUSED(void *p))`.
#define UNUSED(x) __attribute__((__unused__)) x
// Persists a blob (`buf`, `buf_len`) under a binary key (`key`, `key_len`).
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
// printf-style logging callback; `level` is one of the LEVEL_* constants.
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
// Plain-string logging callback; `level` is one of the LEVEL_* constants.
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
// Result code returned by parsers such as parse_text() and parse_markup().
typedef int scan_code_t;
#define SCAN_OK (scan_code_t) 0
#define SCAN_ERR_READ (scan_code_t) (-1)
#define SCAN_ERR_SKIP (scan_code_t) (-2)
// Severity levels passed as the `level` argument of the log callbacks.
#define LEVEL_DEBUG 0
#define LEVEL_INFO 1
#define LEVEL_WARNING 2
#define LEVEL_ERROR 3
#define LEVEL_FATAL 4
// Logging helpers. They require a variable named `ctx` in the calling scope
// that exposes `log` and `logf` callbacks. Each macro expands to a complete
// statement including its trailing semicolon, so call sites omit the `;`.
// The *F variants take a printf-style format and need at least one variadic
// argument (plain __VA_ARGS__ is used).
#define CTX_LOG_DEBUGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_DEBUG, fmt, __VA_ARGS__);
#define CTX_LOG_DEBUG(filepath, str) ctx->log(filepath, LEVEL_DEBUG, str);
#define CTX_LOG_INFOF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_INFO, fmt, __VA_ARGS__);
#define CTX_LOG_INFO(filepath, str) ctx->log(filepath, LEVEL_INFO, str);
#define CTX_LOG_WARNINGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_WARNING, fmt, __VA_ARGS__);
#define CTX_LOG_WARNING(filepath, str) ctx->log(filepath, LEVEL_WARNING, str);
#define CTX_LOG_ERRORF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_ERROR, fmt, __VA_ARGS__);
#define CTX_LOG_ERROR(filepath, str) ctx->log(filepath, LEVEL_ERROR, str);
// The FATAL variants expand to TWO statements (log, then exit(-1)). Used as the
// body of an unbraced `if`, only the log call would be conditional -- always
// wrap them in braces.
#define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1);
#define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1);
// Identifies what a meta_line_t holds. Values are assigned sequentially
// starting at 1, so reordering or inserting entries changes every later value.
enum metakey {
// String
MetaContent = 1,
MetaMediaAudioCodec,
MetaMediaVideoCodec,
MetaArtist,
MetaAlbum,
MetaAlbumArtist,
MetaGenre,
MetaTitle,
MetaFontName,
MetaParent,
MetaExifMake,
MetaExifSoftware,
MetaExifExposureTime,
MetaExifFNumber,
MetaExifFocalLength,
MetaExifUserComment,
MetaExifModel,
MetaExifIsoSpeedRatings,
MetaExifDateTime,
MetaAuthor,
MetaModifiedBy,
MetaThumbnail,
MetaChecksum,
// Number
MetaWidth,
MetaHeight,
MetaMediaDuration,
MetaMediaBitrate,
MetaPages,
// GPS keys. raw.c stores the *Dec values as formatted strings; the value
// type of the DMS/Ref keys is not visible here -- TODO confirm.
MetaExifGpsLongitudeDMS,
MetaExifGpsLongitudeRef,
MetaExifGpsLatitudeDMS,
MetaExifGpsLatitudeRef,
MetaExifGpsLatitudeDec,
MetaExifGpsLongitudeDec,
};
// One metadata entry of a document, kept in a singly linked list.
typedef struct meta_line {
// Next entry in the document's list.
struct meta_line *next;
// Which piece of metadata this entry holds.
enum metakey key;
// Payload; which member is valid depends on `key`.
union {
// Zero-length array: string payloads are stored inline, so string entries
// are allocated as sizeof(meta_line_t) + string size (see msdoc.c / ooxml.c).
char str_val[0];
unsigned long long_val;
double double_val;
};
} meta_line_t;
// A scanned document and the metadata collected for it.
typedef struct document {
// MD5 digest used as the key when storing thumbnails for this document.
unsigned char path_md5[MD5_DIGEST_LENGTH];
// NOTE(review): presumably the file size in bytes -- confirm against the code that fills it.
unsigned long size;
// Numeric MIME identifier (compared against ctx->msdoc_mime in is_msdoc()).
unsigned int mime;
// NOTE(review): presumably the modification time as a Unix timestamp -- confirm.
int mtime;
// NOTE(review): presumably offsets of the base name and extension within `filepath` -- confirm.
short base;
short ext;
// NOTE(review): looks like a boolean flag for documents nested in another file -- confirm.
char has_parent;
// Head and tail of the linked list of metadata entries.
meta_line_t *meta_head;
meta_line_t *meta_tail;
// Path of the document; used as the `filepath` argument in log calls.
char *filepath;
} document_t;
// Forward declaration so the callback types below can refer to the file object.
typedef struct vfile vfile_t;
// Reads up to `size` bytes into `buf`. read_all() treats the return value as
// the number of bytes read; parse_text() treats negative values as errors.
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
// Seek callback; parameters mirror fseek()/lseek() (`offset`, `whence`).
__attribute__((warn_unused_result))
typedef long (*seek_func_t)(struct vfile *, long offset, int whence);
// Releases whatever backs the virtual file.
typedef void (*close_func_t)(struct vfile *);
// NOTE(review): presumably rewinds the file to its start -- confirm in the implementations.
typedef void (*reset_func_t)(struct vfile *);
// Virtual file: a uniform read interface over different backing sources.
typedef struct vfile {
// Backing handle; which member is valid depends on how the vfile was created.
union {
int fd;
struct archive *arc;
const void *_test_data;
};
// NOTE(review): presumably non-zero when backed by a regular filesystem file (`fd`) -- confirm.
int is_fs_file;
// NOTE(review): checksum flags, presumably tied to sha1_ctx / sha1_digest below -- confirm.
int has_checksum;
int calculate_checksum;
// Path used in log messages.
const char *filepath;
// File metadata; parsers use info.st_size to size their read buffers.
struct stat info;
// SHA-1 state and resulting digest.
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
// NOTE(review): buffer state presumably used by read_rewindable -- confirm.
void *rewind_buffer;
int rewind_buffer_size;
int rewind_buffer_cursor;
// I/O operations for this file.
read_func_t read;
read_func_t read_rewindable;
close_func_t close;
reset_func_t reset;
// Logging callbacks available to the I/O implementations.
log_callback_t log;
logf_callback_t logf;
} vfile_t;
// A unit of parsing work: a virtual file plus information about where it came from.
typedef struct parse_job_t {
// NOTE(review): presumably offsets of the base name and extension within `filepath` -- confirm.
int base;
int ext;
// The file to parse.
struct vfile vfile;
// MD5-sized identifier of the parent document.
unsigned char parent[MD5_DIGEST_LENGTH];
// NOTE(review): declared with length 1 -- presumably a variable-length trailing
// path with the struct over-allocated by the creator; confirm at the allocation site.
char filepath[1];
} parse_job_t;
#include "util.h"
// Callback invoked with a parse job.
typedef void (*parse_callback_t)(parse_job_t *job);
#endif

64
third-party/libscan/libscan/text/text.c vendored Normal file
View File

@ -0,0 +1,64 @@
#include "text.h"
/**
 * Index a plain text file.
 *
 * Reads at most ctx->content_size bytes, picks the 16-bit or UTF-8 path from
 * the first two bytes, and attaches the cleaned text as MetaContent.
 *
 * @return SCAN_OK on success (including files too small to index),
 *         SCAN_ERR_READ when reading or allocation fails
 */
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

    int to_read = MIN(ctx->content_size, f->info.st_size);

    if (to_read <= 2) {
        return SCAN_OK;
    }

    char *buf = malloc(to_read);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
        free(buf);
        return SCAN_ERR_READ;
    }
    if (ret > to_read) {
        ret = to_read;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    // Only the first `ret` bytes of buf are initialised; a short read must not
    // expose the rest. memcpy yields the same native-endian value as a direct
    // 16-bit load, without the alignment and aliasing concerns.
    int16_t bom = 0;
    if (ret >= 2) {
        memcpy(&bom, buf, sizeof(bom));
    }

    if (ret >= 2 && bom == (int16_t) 0xFFFE) {
        text_buffer_append_string16_le(&tex, buf + 2, ret - 2);
    } else if (ret >= 2 && bom == (int16_t) 0xFEFF) {
        text_buffer_append_string16_be(&tex, buf + 2, ret - 2);
    } else {
        text_buffer_append_string(&tex, buf, ret);
    }
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
#define MAX_MARKUP_SIZE (1024 * 1024)

/**
 * Index a markup file (tags are stripped, text between them is kept).
 *
 * Reads at most MAX_MARKUP_SIZE bytes and attaches the extracted text,
 * limited to ctx->content_size, as MetaContent.
 *
 * @return SCAN_OK on success, SCAN_ERR_READ when reading or allocation fails
 */
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

    int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);

    // One extra byte for the NUL terminator.
    char *buf = malloc(to_read + 1);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
        free(buf);
        return SCAN_ERR_READ;
    }
    if (ret > to_read) {
        ret = to_read;
    }

    // Terminate after the bytes actually read so a short read never exposes
    // uninitialised memory to the markup scanner.
    *(buf + ret) = '\0';

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_markup(&tex, buf);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}

18
third-party/libscan/libscan/text/text.h vendored Normal file
View File

@ -0,0 +1,18 @@
#ifndef SCAN_TEXT_H
#define SCAN_TEXT_H
#include "../scan.h"
#include "../util.h"
// Settings and callbacks for the plain text and markup parsers.
typedef struct {
// Maximum size of the extracted text; parse_text() also uses it as its read limit.
long content_size;
// Plain-string logger; invoked through the CTX_LOG_* macros.
log_callback_t log;
// printf-style logger; invoked through the CTX_LOG_*F macros.
logf_callback_t logf;
} scan_text_ctx_t;
// Index a plain text file; returns SCAN_OK or SCAN_ERR_READ.
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
// Index a markup file with tags stripped; returns SCAN_OK or SCAN_ERR_READ.
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

0
third-party/libscan/libscan/util.c vendored Normal file
View File

361
third-party/libscan/libscan/util.h vendored Normal file
View File

@ -0,0 +1,361 @@
#ifndef SCAN_UTIL_H
#define SCAN_UTIL_H
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
// True when string `x` begins with `y`. `y` must be a string literal (or char
// array): its length is taken with sizeof, which would be wrong for a pointer.
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
// Returned by the text_buffer_append_* functions once max_size is exceeded.
#define TEXT_BUF_FULL (-1)
// Initial capacity, in bytes, of a dyn_buffer_t.
#define INITIAL_BUF_SIZE (1024 * 16)
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
// Code points kept in extracted text:
//  - '\'' (0x27) through ';' (0x3B): punctuation and digits
//  - 'A' through 'z': letters plus the six ASCII symbols between 'Z' and 'a'
//  - anything above 127 except U+00A0 and U+FFFD
// Everything else is turned into a collapsing space by text_buffer_append_char().
#define SHOULD_KEEP_CHAR(c) (\
((c) >= '\'' && (c) <= ';') || \
((c) >= 'A' && (c) <= 'z') || \
((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD))
// Growable byte buffer.
typedef struct dyn_buffer {
// Heap storage (malloc/realloc).
char *buf;
// Number of bytes written so far; also the next write offset.
size_t cur;
// Allocated capacity in bytes.
size_t size;
} dyn_buffer_t;
// Accumulates cleaned-up text with whitespace collapsing and an optional size limit.
typedef struct text_buffer {
// Size limit in bytes; values <= 0 disable the limit.
long max_size;
// TRUE right after a collapsing space was written, so runs of ignored characters yield one space.
int last_char_was_whitespace;
// Underlying storage.
dyn_buffer_t dyn_buffer;
} text_buffer_t;
// Validate the single UTF-8 encoded character starting at `s`.
// Returns TRUE for a well-formed 1- to 4-byte sequence, FALSE otherwise.
// The function reads one byte past the sequence (up to s[4]), so `s` must
// have at least that many readable bytes; text_buffer_append_string() passes
// a zero-padded 16-byte scratch buffer.
static int utf8_validchr2(const char *s) {
if (0x00 == (0x80 & *s)) {
// Single-byte (ASCII) character.
return TRUE;
} else if (0xf0 == (0xf8 & *s)) {
// 4-byte lead: the next three bytes must be continuation bytes (10xxxxxx)...
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
(0x80 != (0xc0 & s[3]))) {
return FALSE;
}
// ...and the byte after the sequence must not be another continuation byte.
if (0x80 == (0xc0 & s[4])) {
return FALSE;
}
// Reject overlong encodings (value would have fit in fewer bytes).
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
return FALSE;
}
} else if (0xe0 == (0xf0 & *s)) {
// 3-byte lead: two continuation bytes, no stray continuation byte after, not overlong.
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[3])) {
return FALSE;
}
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
return FALSE;
}
} else if (0xc0 == (0xe0 & *s)) {
// 2-byte lead: one continuation byte, no stray continuation byte after, not overlong.
if (0x80 != (0xc0 & s[1])) {
return FALSE;
}
if (0x80 == (0xc0 & s[2])) {
return FALSE;
}
if (0 == (0x1e & s[0])) {
return FALSE;
}
} else {
// Lone continuation byte or invalid lead byte.
return FALSE;
}
return TRUE;
}
/**
 * Create an empty growable buffer with INITIAL_BUF_SIZE bytes of capacity.
 * Release it with dyn_buffer_destroy().
 */
static dyn_buffer_t dyn_buffer_create() {
    dyn_buffer_t fresh;

    fresh.buf = (char *) malloc(INITIAL_BUF_SIZE);
    fresh.cur = 0;
    fresh.size = INITIAL_BUF_SIZE;

    return fresh;
}
/**
 * Make sure `size` more bytes fit after the current write position,
 * doubling the capacity as many times as needed.
 */
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
    const size_t needed = buf->cur + size;
    if (needed <= buf->size) {
        return;
    }

    size_t new_size = buf->size;
    do {
        new_size *= 2;
    } while (needed > new_size);

    buf->size = new_size;
    buf->buf = (char *) realloc(buf->buf, new_size);
}
/**
 * Make sure at least sizeof(long) bytes fit after the current write
 * position, doubling the capacity once when they do not.
 */
static void grow_buffer_small(dyn_buffer_t *buf) {
    const size_t headroom = sizeof(long);
    if (buf->cur + headroom <= buf->size) {
        return;
    }

    buf->size *= 2;
    buf->buf = (char *) realloc(buf->buf, buf->size);
}
/**
 * Append `size` raw bytes from `data`, growing the buffer when necessary.
 */
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
    grow_buffer(buf, size);

    char *dst = buf->buf + buf->cur;
    memcpy(dst, data, size);
    buf->cur = buf->cur + size;
}
/**
 * Append a single byte, growing the buffer when necessary.
 */
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
    grow_buffer_small(buf);

    buf->buf[buf->cur] = c;
    buf->cur += 1;
}
/**
 * Append a C string including its terminating NUL byte.
 */
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
    const size_t str_len = strlen(str);

    dyn_buffer_write(buf, str, str_len);
    dyn_buffer_write_char(buf, '\0');
}
/**
 * Append the characters of a C string without its terminating NUL byte.
 */
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
    const size_t str_len = strlen(str);
    dyn_buffer_write(buf, str, str_len);
}
/**
 * Append an int in native byte order.
 *
 * The write offset can be any byte position (for example after a single
 * char was written), so the value is copied with memcpy rather than stored
 * through a cast pointer, which would be a misaligned access.
 */
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &d, sizeof(d));
    buf->cur += sizeof(d);
}
/**
 * Append a 16-bit unsigned value in native byte order.
 *
 * The write offset can be any byte position, so the value is copied with
 * memcpy rather than stored through a cast pointer, which would be a
 * misaligned access.
 */
static void dyn_buffer_write_short(dyn_buffer_t *buf, uint16_t s) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &s, sizeof(s));
    buf->cur += sizeof(s);
}
/**
 * Append an unsigned long in native byte order.
 *
 * The write offset can be any byte position, so the value is copied with
 * memcpy rather than stored through a cast pointer, which would be a
 * misaligned access.
 */
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &l, sizeof(l));
    buf->cur += sizeof(l);
}
// Free the storage of a dyn_buffer_t. The struct itself is not freed or reset.
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
free(buf->buf);
}
// Free the storage held by a text_buffer_t (its underlying dyn_buffer).
static void text_buffer_destroy(text_buffer_t *buf) {
dyn_buffer_destroy(&buf->dyn_buffer);
}
/**
 * Create an empty text buffer.
 *
 * @param max_size size limit in bytes; values <= 0 disable the limit
 */
static text_buffer_t text_buffer_create(long max_size) {
    text_buffer_t created;

    created.max_size = max_size;
    created.last_char_was_whitespace = FALSE;
    created.dyn_buffer = dyn_buffer_create();

    return created;
}
// Append one code point `c` to the text buffer, encoded as UTF-8.
// Characters rejected by SHOULD_KEEP_CHAR (and plain spaces) are collapsed
// into a single ' '; nothing is written for them at the very start of the
// buffer or right after another collapsed space.
// Returns TEXT_BUF_FULL once the buffer has grown past max_size (when
// max_size > 0), 0 otherwise. The character is still written in that case.
static int text_buffer_append_char(text_buffer_t *buf, int c) {
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
buf->last_char_was_whitespace = TRUE;
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
} else {
buf->last_char_was_whitespace = FALSE;
// Guarantees sizeof(long) free bytes, enough for the longest (4-byte) encoding below.
grow_buffer_small(&buf->dyn_buffer);
if (((utf8_int32_t) 0xffffff80 & c) == 0) {
// Up to 7 bits: single byte.
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c;
} else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
// Up to 11 bits: 110xxxxx 10xxxxxx
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
// Up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else {
// Everything else: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
}
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
return 0;
}
/**
 * NUL-terminate the accumulated text. A trailing space is overwritten by the
 * terminator; otherwise the terminator is appended. Either way the terminator
 * is counted in dyn_buffer.cur afterwards.
 */
static void text_buffer_terminate_string(text_buffer_t *buf) {
    dyn_buffer_t *dyn = &buf->dyn_buffer;
    const int ends_with_space = dyn->cur > 0 && dyn->buf[dyn->cur - 1] == ' ';

    if (!ends_with_space) {
        dyn_buffer_write_char(dyn, '\0');
        return;
    }

    dyn->buf[dyn->cur - 1] = '\0';
}
// Naive UTF16 -> ascii conversion
/**
 * Append every second byte of `str`, starting at index 1 (the odd-indexed
 * byte of each 16-bit unit), as one character each.
 *
 * Stops as soon as the text buffer reports it is full.
 *
 * @return 0 on success, TEXT_BUF_FULL when the size limit was reached
 */
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
    for (size_t i = 1; i < len; i += 2) {
        if (text_buffer_append_char(buf, str[i]) == TEXT_BUF_FULL) {
            // Do not keep growing the buffer past max_size.
            return TEXT_BUF_FULL;
        }
    }
    return 0;
}
/**
 * Append every second byte of `str`, starting at index 0 (the even-indexed
 * byte of each 16-bit unit), as one character each.
 *
 * Stops as soon as the text buffer reports it is full.
 *
 * @return 0 on success, TEXT_BUF_FULL when the size limit was reached
 */
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
    for (size_t i = 0; i < len; i += 2) {
        if (text_buffer_append_char(buf, str[i]) == TEXT_BUF_FULL) {
            // Do not keep growing the buffer past max_size.
            return TEXT_BUF_FULL;
        }
    }
    return 0;
}
// True when `ptr` has reached the end of the `len`-byte input starting at
// `str`: past the end, at a NUL byte, or at a multi-byte lead whose sequence
// would run past the end. Expects `ptr`, `str` and `len` in the calling scope.
#define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
(0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
(0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
// Append up to `len` bytes of UTF-8 text from `str`, skipping malformed
// characters and applying the filtering of text_buffer_append_char().
// Returns 0 on success (also for NULL or empty input) or TEXT_BUF_FULL when
// the size limit is reached.
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
const char *ptr = str;
const char *oldPtr = ptr;
if (str == NULL || UTF8_END_OF_STRING) {
return 0;
}
// Very short inputs: keep only acceptable ASCII bytes. This path writes
// directly to the underlying buffer, so whitespace collapsing and the
// max_size check of text_buffer_append_char() do not apply.
if (len <= 4) {
for (int i = 0; i < len; i++) {
if (((utf8_int32_t) 0xffffff80 & str[i]) == 0 && SHOULD_KEEP_CHAR(str[i])) {
dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
}
}
return 0;
}
utf8_int32_t c;
char tmp[16] = {0};
do {
// Decode one code point and advance.
ptr = (char *) utf8codepoint(ptr, &c);
// Copy the bytes just consumed into a zero-padded scratch buffer so they
// can be validated in isolation.
*(int *) tmp = 0x00000000;
memcpy(tmp, oldPtr, ptr - oldPtr);
oldPtr = ptr;
if (!utf8_validchr2(tmp)) {
continue;
}
int ret = text_buffer_append_char(buf, c);
if (ret != 0) {
return ret;
}
} while (!UTF8_END_OF_STRING);
return 0;
}
/**
 * Append a NUL-terminated UTF-8 string.
 *
 * @return the result of text_buffer_append_string() for the whole string
 */
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
    const size_t str_len = strlen(str);
    return text_buffer_append_string(buf, str, str_len);
}
/**
 * Append the text found between tags of a NUL-terminated markup string.
 *
 * Scanning starts in the "inside a tag" state, so nothing is emitted until the
 * first '>' is seen. Each run of text between '>' and the next '<' is appended,
 * followed by a space. Whatever lies between the last recorded text start and
 * the end of the input is appended as well.
 *
 * @return 0 on success, TEXT_BUF_FULL when the size limit was reached
 */
static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
    int inside_tag = TRUE;
    const char *text_start = markup;
    const char *cursor;

    for (cursor = markup; *cursor != '\0'; cursor++) {
        const char ch = *cursor;

        if (inside_tag && ch == '>') {
            // Tag closed: text begins right after it.
            inside_tag = FALSE;
            text_start = cursor + 1;
            continue;
        }

        if (!inside_tag && ch == '<') {
            // Tag opened: flush the text collected since the previous tag.
            inside_tag = TRUE;
            if (cursor == text_start) {
                continue;
            }
            if (text_buffer_append_string(buf, text_start, (cursor - text_start)) == TEXT_BUF_FULL
                || text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
                return TEXT_BUF_FULL;
            }
        }
    }

    if (cursor != text_start) {
        if (text_buffer_append_string(buf, text_start, (cursor - text_start)) == TEXT_BUF_FULL
            || text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }
    }

    return 0;
}
/**
 * Read the whole virtual file into a freshly allocated buffer.
 *
 * @param f    file to read; f->info.st_size bytes are requested
 * @param size receives the number of bytes read (0 when allocation fails)
 * @return the buffer, to be released with free(), or NULL when allocation
 *         fails or fewer/more bytes than st_size were read
 */
static void *read_all(vfile_t *f, size_t *size) {
    void *buf = malloc(f->info.st_size);
    if (buf == NULL) {
        // Never hand a NULL destination to the read callback.
        *size = 0;
        return NULL;
    }

    *size = f->read(f, buf, f->info.st_size);

    if (*size != f->info.st_size) {
        free(buf);
        return NULL;
    }

    return buf;
}
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)

/**
 * Feed `size` bytes from `buf` into the SHA-1 context.
 *
 * The data is copied into a local buffer before hashing, as the original
 * implementation did (NOTE(review): presumably so SHA1_Update never reads the
 * caller's memory directly -- confirm the motivation). Input larger than the
 * local buffer is hashed in chunks, which produces the same digest as a
 * single update and needs no heap allocation that could fail.
 */
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
    unsigned char stack_buf[STACK_BUFFER_SIZE];

    const unsigned char *src = (const unsigned char *) buf;
    size_t remaining = size;

    while (remaining > 0) {
        size_t chunk = remaining < STACK_BUFFER_SIZE ? remaining : STACK_BUFFER_SIZE;

        memcpy(stack_buf, src, chunk);
        SHA1_Update(ctx, (const void *) stack_buf, chunk);

        src += chunk;
        remaining -= chunk;
    }
}
#endif

View File

@ -0,0 +1,200 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
class StringDocument : public WPXDocumentInterface {
private:
text_buffer_t *tex;
document_t *doc;
bool is_full;
public:
StringDocument(text_buffer_t *tex, document_t *doc) {
this->tex = tex;
this->doc = doc;
this->is_full = false;
}
void setDocumentMetaData(const WPXPropertyList &propList) override {
WPXPropertyList::Iter propIter(propList);
for (propIter.rewind(); propIter.next();) {
// TODO: Read metadata here ?!
}
}
void endDocument() override {
text_buffer_terminate_string(this->tex);
}
void closeParagraph() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSpan() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSection() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertTab() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertSpace() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertText(const WPXString &text) override {
if (!this->is_full) {
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertLineBreak() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
void closePageSpan() override { /* noop */ }
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
void closeHeader() override { /* noop */ }
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
void closeFooter() override { /* noop */ }
void
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
void
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void closeOrderedListLevel() override { /* noop */ }
void closeUnorderedListLevel() override { /* noop */ }
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void closeListElement() override { /* noop */ }
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
void closeFootnote() override { /* noop */ }
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
void closeEndnote() override { /* noop */ }
void openComment(const WPXPropertyList &propList) override { /* noop */ }
void closeComment() override { /* noop */ }
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
void closeTextBox() override { /* noop */ }
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
void closeTableRow() override { /* noop */ }
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTableCell() override { /* noop */ }
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTable() override { /* noop */ }
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
void closeFrame() override { /* noop */ }
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
void startDocument() override { /* noop */ };
};
/**
 * Creates a libwpd string stream over the given memory region and returns it
 * as an opaque handle. Release it with wpd_memory_stream_destroy().
 */
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
/**
 * Asks libwpd how confident it is that the stream holds a supported document
 * and returns that value converted to the C-visible enum.
 */
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    const WPDConfidence level = WPDocument::isFileFormatSupported(input);

    return static_cast<wpd_confidence_t>(level);
}
/**
 * Parses the stream with libwpd, collecting the document text into `tex`
 * through a StringDocument sink. No password is supplied (nullptr).
 * Returns libwpd's result code converted to the C-visible enum.
 */
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    StringDocument sink(tex, doc);

    const WPDResult status = WPDocument::parse(input, &sink, nullptr);

    return static_cast<wpd_result_t>(status);
}
/** Destroys a stream previously returned by wpd_memory_stream_create(). */
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete static_cast<WPXStringStream *>(ptr);
}

View File

@ -0,0 +1,50 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
typedef void *wpd_stream_t;
/*
 * Format-detection result returned by wpd_is_file_format_supported().
 * The C++ wrapper obtains a libwpd WPDConfidence value and casts it directly
 * to this type, so the enumerator order here has to stay in sync with libwpd's
 * enum (NOTE(review): confirm against the libwpd version in use).
 */
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
/*
 * Parse status returned by wpd_parse().
 * The C++ wrapper casts libwpd's WPDResult directly to this type, so the
 * enumerator order has to stay in sync with libwpd's enum
 * (NOTE(review): confirm against the libwpd version in use).
 * C_WPD_OK is the only value treated as success by parse_wpd().
 */
typedef enum {
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
/* Reports how confident libwpd is that `stream` holds a supported document. */
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
/* Wraps `buf_len` bytes at `buf` in a new stream handle; free it with wpd_memory_stream_destroy(). */
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
/* Destroys a stream handle created by wpd_memory_stream_create(). */
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
/* Parses the stream and appends the extracted text to `tex`; returns the parser status. */
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

41
third-party/libscan/libscan/wpd/wpd.c vendored Normal file
View File

@ -0,0 +1,41 @@
#include "wpd.h"
#include "libwpd_c_api.h"
/*
 * Extracts the text of a WordPerfect document and stores it on `doc` as
 * MetaContent.
 *
 * ctx : scan context (used by the CTX_LOG_* macros).
 * f   : file to read; its whole content is loaded in memory.
 * doc : document receiving the extracted text.
 *
 * Returns SCAN_ERR_READ when the file cannot be read, is encrypted, or is not
 * recognised by libwpd; SCAN_OK otherwise. A parse error reported by libwpd is
 * logged, but any text collected before the error is still kept.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {
    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        /* read_all() returns NULL on short reads; do not build a stream over it. */
        CTX_LOG_ERRORF("wpd.c", "Could not read file [%s]", doc->filepath)
        return SCAN_ERR_READ;
    }

    void *stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(-1);
    wpd_result_t res = wpd_parse(stream, &tex, doc);

    if (res != C_WPD_OK) {
        CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                       doc->filepath, res)
    }

    /* Keep whatever text was collected, even after a parse error. */
    if (tex.dyn_buffer.cur != 0) {
        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    }

    text_buffer_destroy(&tex);
    wpd_memory_stream_destroy(stream);
    free(buf);

    /* The original fell off the end of this non-void function.
     * NOTE(review): SCAN_OK is assumed to be the success code declared in scan.h. */
    return SCAN_OK;
}

23
third-party/libscan/libscan/wpd/wpd.h vendored Normal file
View File

@ -0,0 +1,23 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
/* Configuration passed to the WordPerfect (WPD) parser. */
typedef struct {
/* NOTE(review): presumably a limit on extracted content size; parse_wpd() does not read it - confirm. */
long content_size;
/* Logging callbacks; presumably invoked through the CTX_LOG_* macros - confirm in macros.h. */
log_callback_t log;
logf_callback_t logf;
/* MIME identifier that is_wpd() compares against to recognise WPD files. */
unsigned int wpd_mime;
} scan_wpd_ctx_t;
/* Reads the whole file `f`, extracts its text with libwpd and attaches it to `doc` as MetaContent. */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Returns non-zero when `mime` equals the WPD MIME identifier configured in `ctx`. */
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected_mime = ctx->wpd_mime;

    return expected_mime == mime;
}
#endif

1169
third-party/libscan/test/main.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

114
third-party/libscan/test/test_util.cpp vendored Normal file
View File

@ -0,0 +1,114 @@
#include "test_util.h"
#include <gtest/gtest.h>
#include <unistd.h>
#include <fcntl.h>
// Failure message emitted by load_file() when a test fixture cannot be opened.
#define FILE_NOT_FOUND_ERR "Could not find file, did you clone the test files repo?"
/**
 * vfile read callback backed by the file system.
 * Opens f->filepath read-only on first use (when f->fd is -1), then reads up
 * to `size` bytes into `buf`. Returns -1 when the file cannot be opened,
 * otherwise the value returned by read() narrowed to int.
 */
int fs_read(struct vfile *f, void *buf, size_t size) {
    const bool needs_open = (f->fd == -1);

    if (needs_open) {
        f->fd = open(f->filepath, O_RDONLY);
        if (f->fd == -1) {
            return -1;
        }
    }

    return static_cast<int>(read(f->fd, buf, size));
}
/**
 * vfile read callback backed by an in-memory buffer.
 * Copies `size` bytes from the current position of f->_test_data into `buf`
 * and advances that position. Always returns 0.
 * Note: no bounds check is performed on the source buffer.
 */
int mem_read(vfile_t *f, void *buf, size_t size) {
    char *cursor = (char *) f->_test_data;

    memcpy(buf, cursor, size);
    f->_test_data = cursor + size;

    return 0;
}
/** vfile close callback: closes the descriptor if one was opened. */
void fs_close(vfile_t *f) {
    if (f->fd == -1) {
        return;
    }
    close(f->fd);
}
/** Resets the document's metadata list and binds `f` to the file at `filepath`. */
void load_doc_file(const char *filepath, vfile_t *f, document_t *doc) {
    doc->meta_tail = nullptr;
    doc->meta_head = nullptr;

    load_file(filepath, f);
}
/** Resets the document's metadata list and binds `f` to the memory region `mem`. */
void load_doc_mem(void *mem, size_t mem_len, vfile_t *f, document_t *doc) {
    doc->meta_tail = nullptr;
    doc->meta_head = nullptr;

    load_mem(mem, mem_len, f);
}
/** Frees the document's metadata and invokes the file's close callback, if any. */
void cleanup(document_t *doc, vfile_t *f) {
    destroy_doc(doc);

    // Same effect as CLOSE_FILE((*f)), written out explicitly.
    if (f->close != NULL) {
        f->close(f);
    }
}
/**
 * Binds `f` to a file on disk: fills f->info via stat(), opens the file
 * read-only and installs the file-system read/close callbacks.
 * Records a fatal GoogleTest failure (and returns early) when the file
 * cannot be opened.
 */
void load_file(const char *filepath, vfile_t *f) {
    stat(filepath, &f->info);

    const int descriptor = open(filepath, O_RDONLY);
    f->fd = descriptor;
    if (descriptor == -1) {
        FAIL() << FILE_NOT_FOUND_ERR;
    }

    // Callbacks
    f->read = fs_read;
    f->close = fs_close;

    // Identity and flags
    f->filepath = filepath;
    f->is_fs_file = TRUE;
    f->has_checksum = FALSE;
    f->calculate_checksum = TRUE;
}
/**
 * Binds `f` to an in-memory buffer: reads go through mem_read(), there is no
 * close callback, and the reported file size is `size` (narrowed to int).
 */
void load_mem(void *mem, size_t size, vfile_t *f) {
    // Data source
    f->_test_data = mem;
    f->info.st_size = (int) size;

    // Callbacks
    f->read = mem_read;
    f->close = nullptr;

    // Identity and flags
    f->filepath = "_mem_";
    f->is_fs_file = TRUE;
}
/** Returns the first metadata entry of `doc` with the given key, or nullptr. */
meta_line_t *get_meta(document_t *doc, metakey key) {
    meta_line_t *first = doc->meta_head;

    return get_meta_from(first, key);
}
/**
 * Walks the metadata list starting at `meta` and returns the first entry
 * whose key equals `key`, or nullptr when none matches.
 */
meta_line_t *get_meta_from(meta_line_t *meta, metakey key) {
    for (meta_line_t *entry = meta; entry != nullptr; entry = entry->next) {
        if (entry->key == key) {
            return entry;
        }
    }
    return nullptr;
}
/**
 * Frees every entry of the document's metadata list.
 * The list pointers are reset afterwards so the document no longer refers to
 * freed memory and calling this function twice is harmless.
 */
void destroy_doc(document_t *doc) {
    meta_line_t *meta = doc->meta_head;

    while (meta != nullptr) {
        // Grab the successor before releasing the current node.
        meta_line_t *tmp = meta;
        meta = tmp->next;
        free(tmp);
    }

    doc->meta_head = nullptr;
    doc->meta_tail = nullptr;
}
/**
 * Randomly corrupts a buffer for fuzz tests.
 *
 * Performs `n` rounds. Each round picks a random offset, then either
 *  - with probability `trunc_p` percent, truncates the logical length to
 *    max(offset, 1000) bytes, never growing it beyond its current value, or
 *  - overwrites `width` bytes starting at the offset with random values.
 *
 * @param buf      buffer to modify in place
 * @param buf_len  in/out logical length of `buf`; may only shrink
 * @param width    number of consecutive bytes overwritten per round
 * @param n        number of rounds
 * @param trunc_p  truncation probability, in percent
 *
 * Stops early when the buffer is too small (length <= width + 1) to pick a
 * valid offset; the original computed `*buf_len - width - 1` unconditionally,
 * which underflows (or divides by zero) for such buffers.
 */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
    const size_t min_truncated_len = 1000;
    const size_t reserved = (size_t) (width > 0 ? width : 0) + 1;

    for (int i = 0; i < n; i++) {
        if (*buf_len <= reserved) {
            // No room left for a write of `width` bytes: nothing sensible to fuzz.
            return;
        }

        size_t offset = rand() % (*buf_len - reserved);

        if (rand() % 100 < trunc_p) {
            size_t new_len = offset > min_truncated_len ? offset : min_truncated_len;
            // Truncation must never extend the length past the real buffer.
            if (new_len < *buf_len) {
                *buf_len = new_len;
            }
            continue;
        }

        for (int disp = 0; disp < width; disp++) {
            buf[offset + disp] = (int8_t) rand();
        }
    }
}

46
third-party/libscan/test/test_util.h vendored Normal file
View File

@ -0,0 +1,46 @@
#ifndef SCAN_TEST_UTIL_H
#define SCAN_TEST_UTIL_H
#include "../libscan/scan.h"
#include <fcntl.h>
#include <unistd.h>
void load_file(const char *filepath, vfile_t *f);
void load_mem(void *mem, size_t size, vfile_t *f);
void load_doc_mem(void *mem, size_t mem_len, vfile_t *f, document_t *doc);
void load_doc_file(const char *filepath, vfile_t *f, document_t *doc);
void cleanup(document_t *doc, vfile_t *f);
/** printf-style log callback that discards everything it is given. */
static void noop_logf(const char *filepath, int level, char *format, ...) {
    (void) filepath;
    (void) level;
    (void) format;
}
/** Plain-string log callback that discards everything it is given. */
static void noop_log(const char *filepath, int level, char *str) {
    (void) filepath;
    (void) level;
    (void) str;
}
/** Running total of the value sizes passed to counter_store(). */
static size_t store_size = 0;

/**
 * Store callback that keeps nothing: it only adds `value_len` to store_size.
 * (For debugging, the value could instead be dumped to a "<uuid>.jpeg" file
 * named after the key.)
 */
static void counter_store(char *key, size_t key_len, char *value, size_t value_len) {
    (void) key;
    (void) key_len;
    (void) value;

    store_size = store_size + value_len;
}
meta_line_t *get_meta(document_t *doc, metakey key);
meta_line_t *get_meta_from(meta_line_t *meta, metakey key);
// Calls f.close(&f) when a close callback is set. `f` must be a vfile_t lvalue
// (not a pointer); the expansion ends with its own ';', so call sites omit one.
#define CLOSE_FILE(f) if (f.close != NULL) {f.close(&f);};
void destroy_doc(document_t *doc);
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p);
#endif

@ -0,0 +1 @@
Subproject commit 62ae66db99e9dd88dfa31999f516f71bb8bdc8b2

@ -0,0 +1 @@
Subproject commit 146be69f88575d753317d8ef13b16f80e0656fc7