22 Commits
wip ... master

Author SHA1 Message Date
3787475ecb Fix tests 2021-09-20 20:29:37 -04:00
1d95be534b Handle excludes in archive files 2021-09-20 20:15:35 -04:00
da17282374 Support for rewind buffer 2021-09-11 20:46:25 -04:00
52d7649322 Support for sha1sum 2021-09-11 13:00:59 -04:00
23da8ada5f Add basic JSON/NDJSON support 2021-09-06 21:25:05 -04:00
f061212d4b reformat 2021-09-06 21:24:32 -04:00
fe53e1a219 Basic support for WordPerfect files 2021-09-06 14:06:56 -04:00
75ff57fd94 Fix some warnings 2021-09-06 14:04:40 -04:00
1dad8fae20 Parse page numbers from .docx files 2021-09-06 09:47:57 -04:00
22522d7d4a Update CMakeLists.txt 2021-09-05 09:33:17 -04:00
722052e4e1 Bug fixes, rework Meta types, scale ebook tn with ctx args 2021-09-05 09:11:33 -04:00
8a0ac8d0db Fix .docx segmentation fault 2021-08-16 17:50:01 -04:00
413fb4bec7 add fast-epub arg 2021-07-10 12:47:24 -04:00
a12ec1cb06 Fix libgomp path 2021-06-14 14:01:20 -04:00
9be4f02851 Only save GPS data when != 0 for RAW images 2021-06-11 20:40:25 -04:00
598e748214 Fix subtitle problems 2021-06-11 10:04:38 -04:00
58c0758301 Fix memory leak in RAW parsing 2021-06-09 08:22:21 -04:00
ee9c98b488 Fix for segfault in some comic files 2021-06-07 09:01:06 -04:00
bcf3e4695b Use 16-bit ints for meta keys (wip) 2021-06-07 08:40:11 -04:00
8ed4c94314 Add tests for subtitle 2021-05-05 16:10:55 -04:00
f1fc83dc54 Fix build 2021-05-05 14:11:43 -04:00
42d5f09839 Merge pull request #2 from simon987/wip
update
2021-03-26 19:50:14 -04:00
25 changed files with 1284 additions and 242 deletions

View File

@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15)
project(scan) project(scan)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
option(BUILD_TESTS "Build tests" off) option(BUILD_TESTS "Build tests" on)
add_subdirectory(third-party/antiword) add_subdirectory(third-party/antiword)
add_compile_definitions( add_compile_definitions(
@@ -25,14 +25,16 @@ add_library(
libscan/media/media.c libscan/media/media.h libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h) libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C) set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so) set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
target_link_directories(scan PUBLIC /usr/share/vcpkg/installed/x64-linux/lib/)
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED) find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED) find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED) find_package(lz4 REQUIRED)
@@ -42,10 +44,10 @@ find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED) find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED) find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED) find_package(LibXml2 REQUIRED)
find_package(FFMPEG REQUIRED)
#find_package(OpenSSL REQUIRED)
find_package(LibLZMA REQUIRED) find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED) find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec) find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd) find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
@@ -58,7 +60,7 @@ find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms) find_library(CMS_LIB NAMES lcms)
find_library(JAS_LIB NAMES jasper) find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo) find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/) find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
target_compile_options( target_compile_options(
@@ -90,40 +92,68 @@ ExternalProject_Add(
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/) SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/) SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
ExternalProject_Add( ExternalProject_Add(
ffmpeg ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "master" GIT_TAG "n4.4"
UPDATE_COMMAND "" UPDATE_COMMAND ""
PATCH_COMMAND "" PATCH_COMMAND ""
TEST_COMMAND "" TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa --disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-debug --disable-vdpau --disable-vaapi --disable-sdl2 --disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network --extra-cflags=-fPIC --disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND "" INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg" PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg" SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg" BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent BUILD_COMMAND ${MAKE_EXE} -j33 --silent
) )
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg) SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
add_dependencies( add_dependencies(
scan scan
libmobi libmobi
ffmpeg ffmpeg
antiword antiword
libwpd
) )
target_link_libraries( target_link_libraries(
scan scan
PUBLIC PUBLIC
cjson
${LibArchive_LIBRARIES} ${LibArchive_LIBRARIES}
ZLIB::ZLIB ZLIB::ZLIB
BZip2::BZip2 BZip2::BZip2
@@ -136,10 +166,12 @@ target_link_libraries(
${MOBI_LIB_DIR}/libmobi.a ${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB} ${FREETYPE_LIB}
${HARFBUZZ_LIB} ${HARFBUZZ_LIB}
${JBIG2DEC_LIB} ${JBIG2DEC_LIB}
# OpenSSL::SSL OpenSSL::Crypto
stdc++ stdc++
@@ -170,6 +202,7 @@ target_link_libraries(
${GUMBO_LIB} ${GUMBO_LIB}
dl dl
antiword antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
) )
target_include_directories( target_include_directories(
@@ -180,6 +213,7 @@ target_include_directories(
${LIBXML2_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR} ${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR} ${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
) )
if (BUILD_TESTS) if (BUILD_TESTS)

View File

@@ -4,6 +4,8 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <fcntl.h> #include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
int should_parse_filtered_file(const char *filepath, int ext) { int should_parse_filtered_file(const char *filepath, int ext) {
@@ -33,18 +35,81 @@ int should_parse_filtered_file(const char *filepath, int ext) {
return FALSE; return FALSE;
} }
int arc_read(struct vfile *f, void *buf, size_t size) { void arc_close(struct vfile *f) {
size_t read = archive_read_data(f->arc, buf, size); SHA1_Final(f->sha1_digest, &f->sha1_ctx);
if (read != size) { if (f->rewind_buffer != NULL) {
const char* error_str = archive_error_string(f->arc); free(f->rewind_buffer);
f->rewind_buffer = NULL;
f->rewind_buffer_size = 0;
f->rewind_buffer_cursor = 0;
}
}
int arc_read(struct vfile *f, void *buf, size_t size) {
int bytes_copied = 0;
if (f->rewind_buffer_size != 0) {
if (size > f->rewind_buffer_size) {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);
bytes_copied = f->rewind_buffer_size;
size -= f->rewind_buffer_size;
buf += f->rewind_buffer_size;
f->rewind_buffer_size = 0;
} else {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
f->rewind_buffer_size -= (int) size;
f->rewind_buffer_cursor += (int) size;
return (int) size;
}
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
}
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) { if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str); f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
} }
return -1; return -1;
} }
return read; return (int) bytes_read + bytes_copied;
}
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
if (f->rewind_buffer != NULL) {
fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
exit(-1);
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
f->rewind_buffer = malloc(size);
f->rewind_buffer_size = (int) size;
f->rewind_buffer_cursor = 0;
memcpy(f->rewind_buffer, buf, size);
return (int) bytes_read;
} }
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) { int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
@@ -58,7 +123,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
archive_read_add_passphrase(*a, ctx->passphrase); archive_read_add_passphrase(*a, ctx->passphrase);
} }
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
} else if (allow_recurse) { } else if (allow_recurse) {
*a = archive_read_new(); *a = archive_read_new();
archive_read_support_filter_all(*a); archive_read_support_filter_all(*a);
@@ -78,7 +143,10 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
} }
} }
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { static __thread int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
struct archive *a = NULL; struct archive *a = NULL;
struct archive_entry *entry = NULL; struct archive_entry *entry = NULL;
@@ -102,8 +170,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) { if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char* utf8_name = archive_entry_pathname_utf8(entry); const char *utf8_name = archive_entry_pathname_utf8(entry);
const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
dyn_buffer_append_string(&buf, file_path); dyn_buffer_append_string(&buf, file_path);
dyn_buffer_write_char(&buf, ' '); dyn_buffer_write_char(&buf, ' ');
@@ -121,21 +189,26 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2); parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
sub_job->vfile.close = NULL; sub_job->vfile.close = arc_close;
sub_job->vfile.read = arc_read; sub_job->vfile.read = arc_read;
sub_job->vfile.read_rewindable = arc_read_rewindable;
sub_job->vfile.reset = NULL; sub_job->vfile.reset = NULL;
sub_job->vfile.arc = a; sub_job->vfile.arc = a;
sub_job->vfile.filepath = sub_job->filepath; sub_job->vfile.filepath = sub_job->filepath;
sub_job->vfile.is_fs_file = FALSE; sub_job->vfile.is_fs_file = FALSE;
sub_job->vfile.rewind_buffer_size = 0;
sub_job->vfile.rewind_buffer = NULL;
sub_job->vfile.log = ctx->log; sub_job->vfile.log = ctx->log;
sub_job->vfile.logf = ctx->logf; sub_job->vfile.logf = ctx->logf;
sub_job->vfile.has_checksum = FALSE;
sub_job->vfile.calculate_checksum = f->calculate_checksum;
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH); memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
sub_job->vfile.info = *archive_entry_stat(entry); sub_job->vfile.info = *archive_entry_stat(entry);
if (S_ISREG(sub_job->vfile.info.st_mode)) { if (S_ISREG(sub_job->vfile.info.st_mode)) {
const char* utf8_name = archive_entry_pathname_utf8(entry); const char *utf8_name = archive_entry_pathname_utf8(entry);
if (utf8_name == NULL) { if (utf8_name == NULL) {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry)); sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
@@ -144,13 +217,21 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
} }
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1; sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
// Handle excludes
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
continue;
}
char *p = strrchr(sub_job->filepath, '.'); char *p = strrchr(sub_job->filepath, '.');
if (p != NULL) { if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
sub_job->ext = (int) (p - sub_job->filepath + 1); sub_job->ext = (int) (p - sub_job->filepath + 1);
} else { } else {
sub_job->ext = (int) strlen(sub_job->filepath); sub_job->ext = (int) strlen(sub_job->filepath);
} }
SHA1_Init(&sub_job->vfile.sha1_ctx);
ctx->parse(sub_job); ctx->parse(sub_job);
} }
} }

View File

@@ -4,9 +4,10 @@
#include <archive.h> #include <archive.h>
#include <archive_entry.h> #include <archive_entry.h>
#include <fcntl.h> #include <fcntl.h>
#include <pcre.h>
#include "../scan.h" #include "../scan.h"
# define ARC_SKIPPED -1 # define ARC_SKIPPED (-1)
#define ARC_MODE_SKIP 0 #define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1 #define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2 #define ARC_MODE_SHALLOW 2
@@ -31,27 +32,34 @@ typedef struct {
} arc_data_t; } arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) { static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t*)user_data; arc_data_t *data = (arc_data_t *) user_data;
if (data->f->is_fs_file && data->f->fd == -1) { if (!data->f->is_fs_file) {
data->f->fd = open(data->f->filepath, O_RDONLY); SHA1_Init(&data->f->sha1_ctx);
} }
return ARCHIVE_OK; return ARCHIVE_OK;
} }
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) { static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t*)user_data; arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf; *buf = data->buf;
return data->f->read(data->f, data->buf, ARC_BUF_SIZE); long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
} }
static int vfile_close_callback(struct archive *a, void *user_data) { static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t*)user_data; arc_data_t *data = (arc_data_t *) user_data;
if (data->f->close != NULL) { if (!data->f->is_fs_file) {
data->f->close(data->f); SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
} }
return ARCHIVE_OK; return ARCHIVE_OK;
@@ -61,8 +69,12 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
int should_parse_filtered_file(const char *filepath, int ext); int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc); scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
int arc_read(struct vfile * f, void *buf, size_t size); int arc_read(struct vfile *f, void *buf, size_t size);
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif #endif

View File

@@ -12,6 +12,10 @@ void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive_entry *entry = NULL; struct archive_entry *entry = NULL;
arc_data_t arc_data; arc_data_t arc_data;
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE); int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) { if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a)) CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
@@ -26,10 +30,10 @@ void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.'); char *p = strrchr(file_path, '.');
if (p != NULL && strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0) { if (p != NULL && (strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0)) {
size_t entry_size = archive_entry_size(entry); size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size); void *buf = malloc(entry_size);
int read = archive_read_data(a, buf, entry_size); size_t read = archive_read_data(a, buf, entry_size);
if (read != entry_size) { if (read != entry_size) {
const char *err_str = archive_error_string(a); const char *err_str = archive_error_string(a);

View File

@@ -4,6 +4,7 @@
#include <tesseract/capi.h> #include <tesseract/capi.h>
#include "../media/media.h" #include "../media/media.h"
#include "../arc/arc.h"
#define MIN_OCR_SIZE 350 #define MIN_OCR_SIZE 350
#define MIN_OCR_LEN 10 #define MIN_OCR_LEN 10
@@ -38,16 +39,15 @@ int pixmap_is_blank(const fz_pixmap *pixmap) {
return TRUE; return TRUE;
} }
fz_pixmap *load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) { fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0; int err = 0;
fz_var(cover); fz_var(cover);
fz_var(err); fz_var(err);
fz_try(fzctx) fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
*cover = fz_load_page(fzctx, fzdoc, page); fz_catch(fzctx)err = 1;
fz_catch(fzctx)
err = 1;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message) CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
@@ -75,14 +75,11 @@ fz_pixmap *load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_do
fz_var(err); fz_var(err);
fz_try(fzctx) { fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL); fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} } fz_always(fzctx) {
fz_always(fzctx) { fz_close_device(fzctx, dev);
fz_close_device(fzctx, dev); fz_drop_device(fzctx, dev);
fz_drop_device(fzctx, dev); } fz_catch(fzctx)err = fzctx->error.errcode;
}
fz_catch(fzctx)
err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message) CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
@@ -131,10 +128,14 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1); int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len); uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1); av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
1);
const uint8_t *in_data[1] = {pixmap->samples}; unsigned char *samples = calloc(1, 1024 * 1024 * 1024);
int in_line_size[1] = {pixmap->stride}; memcpy(samples, pixmap->samples, pixmap->stride * pixmap->h);
const uint8_t *in_data[1] = {samples,};
int in_line_size[1] = {(int) pixmap->stride};
sws_scale(sws_ctx, sws_scale(sws_ctx,
in_data, in_line_size, in_data, in_line_size,
@@ -149,7 +150,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
sws_freeContext(sws_ctx); sws_freeContext(sws_ctx);
// YUV420p -> JPEG // YUV420p -> JPEG
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, 1.0f); AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame); avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet; AVPacket jpeg_packet;
@@ -159,6 +160,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
APPEND_TN_META(doc, pixmap->w, pixmap->h) APPEND_TN_META(doc, pixmap->w, pixmap->h)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size); ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
free(samples);
av_packet_unref(&jpeg_packet); av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data); av_free(*scaled_frame->data);
av_frame_free(&scaled_frame); av_frame_free(&scaled_frame);
@@ -187,10 +189,10 @@ void fz_warn_callback(void *user, const char *message) {
static void init_fzctx(fz_context *fzctx, document_t *doc) { static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx); fz_register_document_handlers(fzctx);
static int mu_is_initialized = 0; static int mu_is_initialized = FALSE;
if (!mu_is_initialized) { if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL); pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = 1; mu_is_initialized = TRUE;
} }
fzctx->warn.print_user = doc; fzctx->warn.print_user = doc;
@@ -223,7 +225,7 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
return 0; return 0;
} }
#define IS_VALID_BPP(d) (d==1 || d==2 || d==4 || d==8 || d==16 || d==24 || d==32) #define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32)
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev), void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha), fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
@@ -255,9 +257,10 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
} }
} }
void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) { void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
thread_ctx = *ctx; thread_ctx = *ctx;
init_fzctx(fzctx, doc); init_fzctx(fzctx, doc);
@@ -270,13 +273,10 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
fz_var(stream); fz_var(stream);
fz_var(err); fz_var(err);
fz_try(fzctx) fz_try(fzctx) {
{ stream = fz_open_memory(fzctx, buf, buf_len);
stream = fz_open_memory(fzctx, buf, buf_len); fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream); } fz_catch(fzctx)err = fzctx->error.errcode;
}
fz_catch(fzctx)
err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
fz_drop_stream(fzctx, stream); fz_drop_stream(fzctx, stream);
@@ -287,10 +287,8 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
int page_count = -1; int page_count = -1;
fz_var(err); fz_var(err);
fz_try(fzctx) fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
page_count = fz_count_pages(fzctx, fzdoc); fz_catch(fzctx)err = fzctx->error.errcode;
fz_catch(fzctx)
err = fzctx->error.errcode;
if (err) { if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message) CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
@@ -300,7 +298,7 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
return; return;
} }
APPEND_INT_META(doc, MetaPages, page_count) APPEND_LONG_META(doc, MetaPages, page_count)
if (ctx->tn_size > 0) { if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) { if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
@@ -319,20 +317,16 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
} }
char title[8192] = {'\0',}; char title[8192] = {'\0',};
fz_try(fzctx) fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); fz_catch(fzctx);
fz_catch(fzctx)
;
if (strlen(title) > 0) { if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title) APPEND_UTF8_META(doc, MetaTitle, title)
} }
char author[4096] = {'\0',}; char author[4096] = {'\0',};
fz_try(fzctx) fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author)); fz_catch(fzctx);
fz_catch(fzctx)
;
if (strlen(author) > 0) { if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author) APPEND_UTF8_META(doc, MetaAuthor, author)
@@ -346,10 +340,8 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
for (int current_page = 0; current_page < page_count; current_page++) { for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL; fz_page *page = NULL;
fz_var(err); fz_var(err);
fz_try(fzctx) fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
page = fz_load_page(fzctx, fzdoc, current_page); fz_catch(fzctx)err = fzctx->error.errcode;
fz_catch(fzctx)
err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message) CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer); text_buffer_destroy(&thread_buffer);
@@ -373,15 +365,11 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
} }
fz_var(err); fz_var(err);
fz_try(fzctx) fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_run_page(fzctx, page, dev, fz_identity, NULL); fz_always(fzctx) {
fz_always(fzctx) fz_close_device(fzctx, dev);
{ fz_drop_device(fzctx, dev);
fz_close_device(fzctx, dev); } fz_catch(fzctx)err = fzctx->error.errcode;
fz_drop_device(fzctx, dev);
}
fz_catch(fzctx)
err = fzctx->error.errcode;
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message) CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
@@ -424,7 +412,77 @@ void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const cha
fz_drop_context(fzctx); fz_drop_context(fzctx);
} }
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
text_buffer_t content_buffer = text_buffer_create(ctx->content_size);
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat info = *archive_entry_stat(entry);
if (S_ISREG(info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.');
if (p != NULL && (strcmp(p, ".html") == 0 || (strcmp(p, ".xhtml") == 0))) {
size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size + 1);
size_t read = archive_read_data(a, buf, entry_size);
*(char *) (buf + entry_size) = '\0';
if (read != entry_size) {
const char *err_str = archive_error_string(a);
if (err_str) {
CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
}
free(buf);
break;
}
ret = text_buffer_append_markup(&content_buffer, buf);
free(buf);
if (ret == TEXT_BUF_FULL) {
break;
}
}
}
}
text_buffer_terminate_string(&content_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&content_buffer);
archive_read_free(a);
}
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) { void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
if (ctx->fast_epub_parse && is_epub(mime_str)) {
parse_epub_fast(ctx, f, doc);
return;
}
size_t buf_len; size_t buf_len;
void *buf = read_all(f, &buf_len); void *buf = read_all(f, &buf_len);
if (buf == NULL) { if (buf == NULL) {

View File

@@ -13,9 +13,18 @@ typedef struct {
log_callback_t log; log_callback_t log;
logf_callback_t logf; logf_callback_t logf;
store_callback_t store; store_callback_t store;
int fast_epub_parse;
float tn_qscale;
} scan_ebook_ctx_t; } scan_ebook_ctx_t;
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char* mime_str, document_t *doc); void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
void parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
/**
 * Tell whether a MIME type string designates an EPUB document.
 *
 * @param mime_string NUL-terminated MIME type; must not be NULL because it is
 *                    handed directly to strcmp().
 * @return 1 when the string is exactly "application/epub+zip", 0 otherwise.
 */
__always_inline
static int is_epub(const char *mime_string) {
    const char *const epub_mime = "application/epub+zip";

    if (strcmp(mime_string, epub_mime) != 0) {
        return 0;
    }
    return 1;
}
#endif #endif

View File

@@ -144,27 +144,28 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
} }
size_t buf_len = 0; size_t buf_len = 0;
void * buf = read_all(f, &buf_len); void *buf = read_all(f, &buf_len);
if (buf == NULL) { if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed") CTX_LOG_ERROR(f->filepath, "read_all() failed")
return; return;
} }
FT_Face face; FT_Face face;
FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, buf_len, 0, &face); FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
if (err != 0) { if (err != 0) {
CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err, FT_Error_String(err)) CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
FT_Error_String(err))
free(buf); free(buf);
return; return;
} }
char font_name[1024]; char font_name[4096];
if (face->style_name == NULL || *(face->style_name) == '?') { if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
if (face->family_name == NULL) { if (face->family_name == NULL) {
strcpy(font_name, "(null)"); strcpy(font_name, "(null)");
} else { } else {
strcpy(font_name, face->family_name); strncpy(font_name, face->family_name, sizeof(font_name));
} }
} else { } else {
snprintf(font_name, sizeof(font_name), "%s %s", face->family_name, face->style_name); snprintf(font_name, sizeof(font_name), "%s %s", face->family_name, face->style_name);
@@ -186,7 +187,8 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
err = FT_Set_Pixel_Sizes(face, 0, pixel); err = FT_Set_Pixel_Sizes(face, 0, pixel);
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err, FT_Error_String(err)) CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
FT_Error_String(err))
FT_Done_Face(face); FT_Done_Face(face);
free(buf); free(buf);
return; return;
@@ -207,7 +209,8 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
c = c >= 'a' && c <= 'z' ? c - 32 : c + 32; c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER); err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
if (err != 0) { if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err, FT_Error_String(err)) CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
FT_Error_String(err))
continue; continue;
} }
} }

119
libscan/json/json.c Normal file
View File

@@ -0,0 +1,119 @@
#include "json.h"
#include "cjson/cJSON.h"
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
/**
 * Recursively append every string value of a cJSON tree to a text buffer.
 *
 * Each string value is followed by a single space. Objects and arrays are
 * descended into through their child list; every other node type (numbers,
 * booleans, null, or a NULL node) contributes nothing. Object keys are not
 * appended, only values.
 *
 * @param json node to walk (may be NULL, e.g. after a failed parse)
 * @param tex  destination text buffer
 * @return TRUE as soon as the text buffer reports TEXT_BUF_FULL, FALSE otherwise
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsString(json)) {
        // Short-circuit keeps the original order: the separator is only
        // attempted when the string itself did not fill the buffer.
        if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL
            || text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
            return TRUE;
        }
        return FALSE;
    }

    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
        // Objects and arrays both keep their members in the same linked list.
        for (cJSON *item = json->child; item != NULL; item = item->next) {
            if (json_extract_text(item, tex)) {
                return TRUE;
            }
        }
    }

    return FALSE;
}
/**
 * Parse a whole JSON file and store its string values as the document content.
 *
 * The file is read entirely into memory, NUL-terminated, parsed with cJSON and
 * every string value is appended (space separated) to a MetaContent entry.
 * If the JSON cannot be parsed, cJSON returns NULL and an empty MetaContent
 * entry is appended.
 *
 * @param ctx scan context (content size limit and log callbacks)
 * @param f   virtual file to read from
 * @param doc document receiving the MetaContent entry
 * @return SCAN_ERR_SKIP when the file is larger than JSON_MAX_FILE_SIZE,
 *         SCAN_ERR_READ when the file cannot be read or the buffer cannot be
 *         grown, SCAN_OK otherwise
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // Grow by one byte for the terminator. Keep the old pointer until realloc()
    // has succeeded so that it can still be released on failure.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        CTX_LOG_ERROR(f->filepath, "realloc() failed")
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    // TRUE: require the input to be fully consumed up to the NUL terminator.
    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    // json may be NULL here (parse error); json_extract_text() then adds nothing.
    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
#define JSON_BUF_SIZE (1024 * 1024 * 5)
/**
 * Parse a newline-delimited JSON file and store its string values as the
 * document content.
 *
 * The file is streamed through a fixed window of JSON_BUF_SIZE bytes. On each
 * iteration one JSON value is parsed from the start of the window, its strings
 * are appended to the text buffer, the consumed bytes are shifted out and the
 * freed space at the end of the window is refilled from the file.
 *
 * @param ctx scan context (content size limit and log callbacks)
 * @param f   virtual file to read from
 * @param doc document receiving the MetaContent entry
 * @return SCAN_ERR_READ when the window cannot be allocated, SCAN_OK otherwise
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
    // One extra byte, always left at zero, terminates the window for cJSON.
    char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "calloc() failed")
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        if (!eof) {
            // First pass fills the whole window; later passes only refill the
            // bytes consumed by the previous value.
            to_read = parse_end == buf ? JSON_BUF_SIZE : (size_t) (parse_end - buf);
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
            }
        }

        cJSON *json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }
        if (parse_end == buf) {
            // Nothing could be consumed: end of data (or unparseable input).
            cJSON_Delete(json);
            break;
        }

        int buffer_full = json_extract_text(json, &tex);
        cJSON_Delete(json);
        if (buffer_full) {
            // The content buffer cannot take more text, stop reading.
            break;
        }

        size_t consumed = (size_t) (parse_end - buf);
        size_t remaining = JSON_BUF_SIZE - consumed;

        memmove(buf, parse_end, remaining);
        // Clear the vacated tail: once the file is exhausted (short or failed
        // read) stale bytes left there would be parsed again as if they were
        // new input.
        memset(buf + remaining, 0, consumed);
        ptr = buf + remaining;
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}

30
libscan/json/json.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
/* Context passed to parse_json() / parse_ndjson() and to the is_json() /
 * is_ndjson() helpers. */
typedef struct {
/* Passed to text_buffer_create() as the size of the extracted-content buffer. */
long content_size;
/* Logging callbacks; presumably what the CTX_LOG_* macros call through `ctx` -- confirm in scan.h. */
log_callback_t log;
logf_callback_t logf;
/* NOTE(review): not referenced by json.c; presumably kept for layout parity with the other scan contexts -- confirm. */
store_callback_t store;
/* MIME identifier that is_json() compares against. */
unsigned int json_mime;
/* MIME identifier that is_ndjson() compares against. */
unsigned int ndjson_mime;
} scan_json_ctx_t;
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
/**
 * Check a MIME identifier against the JSON identifier stored in the context.
 *
 * @param ctx  JSON scan context holding json_mime
 * @param mime MIME identifier to test
 * @return non-zero when mime equals ctx->json_mime, 0 otherwise
 */
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->json_mime;
    return expected == mime;
}
/**
 * Check a MIME identifier against the NDJSON identifier stored in the context.
 *
 * @param ctx  JSON scan context holding ndjson_mime
 * @param mime MIME identifier to test
 * @return non-zero when mime equals ctx->ndjson_mime, 0 otherwise
 */
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->ndjson_mime;
    return expected == mime;
}
#endif

View File

@@ -20,17 +20,20 @@
#undef ABS #undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a)) #define ABS(a) (((a) < 0) ? -(a) : (a))
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#define APPEND_STR_META(doc, keyname, value) \ #define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \ meta_str->key = keyname; \
strcpy(meta_str->str_val, value); \ strcpy(meta_str->str_val, value); \
APPEND_META(doc, meta_str)} APPEND_META(doc, meta_str)}
#define APPEND_INT_META(doc, keyname, value) \ #define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_int = malloc(sizeof(meta_line_t)); \ {meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_int->key = keyname; \ meta_long->key = keyname; \
meta_int->int_val = value; \ meta_long->long_val = value; \
APPEND_META(doc, meta_int)} APPEND_META(doc, meta_long)}
#define APPEND_TN_META(doc, width, height) \ #define APPEND_TN_META(doc, width, height) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \ {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \

View File

@@ -7,6 +7,22 @@
#define STORE_AS_IS ((void*)-1) #define STORE_AS_IS ((void*)-1)
/**
 * Pick the file name to hand to the media demuxer for a document.
 *
 * When the document is considered to have an extension (doc->ext is greater
 * than doc->base) the given path is returned unchanged. Otherwise a placeholder
 * name carrying an extension is substituted for the two image MIME types
 * handled here; any other MIME type falls back to the given path.
 *
 * @param doc      document whose base/ext fields are inspected
 * @param filepath path returned when no substitution applies
 * @param mime_str NUL-terminated MIME type of the document
 * @return filepath, or a string literal ("file.png" / "file.jpg")
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
    if (doc->ext > doc->base) {
        // Already has an extension, nothing to substitute.
        return filepath;
    }

    if (strcmp(mime_str, "image/png") == 0) {
        return "file.png";
    }
    if (strcmp(mime_str, "image/jpeg") == 0) {
        return "file.jpg";
    }

    return filepath;
}
__always_inline __always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) { void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
@@ -119,6 +135,10 @@ static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, i
for (int i = 0; i < subtitle.num_rects; i++) { for (int i = 0; i < subtitle.num_rects; i++) {
const char *text = subtitle.rects[i]->ass; const char *text = subtitle.rects[i]->ass;
if (text == NULL) {
continue;
}
char *idx = strstr(text, "\\N"); char *idx = strstr(text, "\\N");
if (idx != NULL && strlen(idx + 2) > 1) { if (idx != NULL && strlen(idx + 2) > 1) {
text_buffer_append_string0(&tex, idx + 2); text_buffer_append_string0(&tex, idx + 2);
@@ -127,12 +147,15 @@ static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, i
} }
avsubtitle_free(&subtitle); avsubtitle_free(&subtitle);
} }
av_packet_unref(&packet);
} }
text_buffer_terminate_string(&tex); text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf) APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
text_buffer_destroy(&tex); text_buffer_destroy(&tex);
avcodec_free_context(&decoder);
} }
__always_inline __always_inline
@@ -254,6 +277,9 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
meta_line_t *meta_duration = malloc(sizeof(meta_line_t)); meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration; meta_duration->key = MetaMediaDuration;
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE; meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
if (meta_duration->long_val > INT32_MAX) {
meta_duration->long_val = 0;
}
APPEND_META(doc, meta_duration) APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t)); meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
@@ -284,26 +310,34 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
if (strcmp(key, "artist") == 0) { if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist); append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(tag->key, "ImageDescription") == 0) { } else if (strcmp(key, "imagedescription") == 0) {
APPEND_TAG_META(MetaContent) APPEND_TAG_META(MetaContent)
} else if (strcmp(tag->key, "Make") == 0) { } else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake) APPEND_TAG_META(MetaExifMake)
} else if (strcmp(tag->key, "Model") == 0) { } else if (strcmp(key, "model") == 0) {
APPEND_TAG_META(MetaExifModel) APPEND_TAG_META(MetaExifModel)
} else if (strcmp(tag->key, "Software") == 0) { } else if (strcmp(key, "software") == 0) {
APPEND_TAG_META(MetaExifSoftware) APPEND_TAG_META(MetaExifSoftware)
} else if (strcmp(tag->key, "FNumber") == 0) { } else if (strcmp(key, "fnumber") == 0) {
APPEND_TAG_META(MetaExifFNumber) APPEND_TAG_META(MetaExifFNumber)
} else if (strcmp(tag->key, "FocalLength") == 0) { } else if (strcmp(key, "focallength") == 0) {
APPEND_TAG_META(MetaExifFocalLength) APPEND_TAG_META(MetaExifFocalLength)
} else if (strcmp(tag->key, "UserComment") == 0) { } else if (strcmp(key, "usercomment") == 0) {
APPEND_TAG_META(MetaExifUserComment) APPEND_TAG_META(MetaExifUserComment)
} else if (strcmp(tag->key, "ISOSpeedRatings") == 0) { } else if (strcmp(key, "isospeedratings") == 0) {
APPEND_TAG_META(MetaExifIsoSpeedRatings) APPEND_TAG_META(MetaExifIsoSpeedRatings)
} else if (strcmp(tag->key, "ExposureTime") == 0) { } else if (strcmp(key, "exposuretime") == 0) {
APPEND_TAG_META(MetaExifExposureTime) APPEND_TAG_META(MetaExifExposureTime)
} else if (strcmp(tag->key, "DateTime") == 0) { } else if (strcmp(key, "datetime") == 0) {
APPEND_TAG_META(MetaExifDateTime) APPEND_TAG_META(MetaExifDateTime)
} else if (strcmp(key, "gpslatitude") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeDMS)
} else if (strcmp(key, "gpslatituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeRef)
} else if (strcmp(key, "gpslongitude") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeDMS)
} else if (strcmp(key, "gpslongituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeRef)
} }
} }
} }
@@ -328,7 +362,6 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name) APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
} }
append_audio_meta(pFormatCtx, doc);
audio_stream = i; audio_stream = i;
} }
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { } else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
@@ -342,12 +375,12 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
meta_line_t *meta_w = malloc(sizeof(meta_line_t)); meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth; meta_w->key = MetaWidth;
meta_w->int_val = stream->codecpar->width; meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w) APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t)); meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight; meta_h->key = MetaHeight;
meta_h->int_val = stream->codecpar->height; meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h) APPEND_META(doc, meta_h)
video_stream = i; video_stream = i;
@@ -366,6 +399,10 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
} }
} }
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) { if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream]; AVStream *stream = pFormatCtx->streams[video_stream];
@@ -476,7 +513,7 @@ int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
} }
typedef struct { typedef struct {
struct stat info; size_t size;
FILE *file; FILE *file;
void *buf; void *buf;
} memfile_t; } memfile_t;
@@ -490,14 +527,14 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
return AVERROR_EOF; return AVERROR_EOF;
} }
return buf_size; return (int) ret;
} }
long memfile_seek(void *ptr, long offset, int whence) { long memfile_seek(void *ptr, long offset, int whence) {
memfile_t *mem = ptr; memfile_t *mem = ptr;
if (whence == 0x10000) { if (whence == 0x10000) {
return mem->info.st_size; return mem->size;
} }
int ret = fseek(mem->file, offset, whence); int ret = fseek(mem->file, offset, whence);
@@ -509,24 +546,31 @@ long memfile_seek(void *ptr, long offset, int whence) {
} }
int memfile_open(vfile_t *f, memfile_t *mem) { int memfile_open(vfile_t *f, memfile_t *mem) {
mem->info = f->info; mem->size = f->info.st_size;
mem->buf = malloc(mem->info.st_size); mem->buf = malloc(mem->size);
if (mem->buf == NULL) { if (mem->buf == NULL) {
return -1; return -1;
} }
int ret = f->read(f, mem->buf, mem->info.st_size); int ret = f->read(f, mem->buf, mem->size);
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); mem->file = fmemopen(mem->buf, mem->size, "rb");
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1; if (f->calculate_checksum) {
SHA1_Init(&f->sha1_ctx);
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
f->has_checksum = TRUE;
}
return (ret == mem->size && mem->file != NULL) ? 0 : -1;
} }
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) { int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
mem->info.st_size = buf_len; mem->size = (int) buf_len;
mem->buf = buf; mem->buf = buf;
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); mem->file = fmemopen(mem->buf, mem->size, "rb");
return mem->file != NULL ? 0 : -1; return mem->file != NULL ? 0 : -1;
} }
@@ -538,7 +582,7 @@ void memfile_close(memfile_t *mem) {
} }
} }
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) { void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
AVFormatContext *pFormatCtx = avformat_alloc_context(); AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) { if (pFormatCtx == NULL) {
@@ -548,7 +592,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE); unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
AVIOContext *io_ctx = NULL; AVIOContext *io_ctx = NULL;
memfile_t memfile = {{}, 0, 0}; memfile_t memfile = {0, 0, 0};
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
if (f->info.st_size <= ctx->max_media_buffer) { if (f->info.st_size <= ctx->max_media_buffer) {
int ret = memfile_open(f, &memfile); int ret = memfile_open(f, &memfile);
@@ -565,7 +611,7 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
pFormatCtx->pb = io_ctx; pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, f->filepath, NULL, NULL); int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) { if (res < 0) {
if (res != -5) { if (res != -5) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res)) CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
@@ -584,12 +630,12 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
memfile_close(&memfile); memfile_close(&memfile);
} }
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) { void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
if (f->is_fs_file) { if (f->is_fs_file) {
parse_media_filename(ctx, f->filepath, doc); parse_media_filename(ctx, f->filepath, doc);
} else { } else {
parse_media_vfile(ctx, f, doc); parse_media_vfile(ctx, f, doc, mime_str);
} }
} }
@@ -598,7 +644,7 @@ void init_media() {
} }
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) { int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
memfile_t memfile; memfile_t memfile = {0, 0, 0};
AVIOContext *io_ctx = NULL; AVIOContext *io_ctx = NULL;
AVFormatContext *pFormatCtx = avformat_alloc_context(); AVFormatContext *pFormatCtx = avformat_alloc_context();
@@ -616,8 +662,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
} else { } else {
avformat_close_input(&pFormatCtx); avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx); avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file); fclose(memfile.file);
return FALSE; return FALSE;
} }
@@ -637,7 +681,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
AVStream *stream = pFormatCtx->streams[0]; AVStream *stream = pFormatCtx->streams[0];
// Decoder // Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id); const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec); AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar); avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL); avcodec_open2(decoder, video_codec, NULL);

View File

@@ -18,12 +18,13 @@ typedef struct {
int tn_size; int tn_size;
float tn_qscale; float tn_qscale;
long max_media_buffer; long max_media_buffer;
int read_subtitles;
} scan_media_ctx_t; } scan_media_ctx_t;
__always_inline __always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) { static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG); const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec); AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
jpeg->width = w; jpeg->width = w;
jpeg->height = h; jpeg->height = h;
@@ -42,9 +43,10 @@ static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
} }
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc); void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
void init_media(); void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void* buf, size_t buf_len, document_t *doc, const char *url); int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif #endif

View File

@@ -6,7 +6,7 @@
#include "../ebook/ebook.h" #include "../ebook/ebook.h"
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len) { void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {
// Open word doc // Open word doc
options_type *opts = direct_vGetOptions(); options_type *opts = direct_vGetOptions();
@@ -20,7 +20,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
opts->iPageWidth = 595; opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3; opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file_in, buf_len); int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) { if (doc_word_version < 0 || doc_word_version == 3) {
free(buf); free(buf);
return; return;
@@ -38,19 +38,19 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
return; return;
} }
iInitDocument(file_in, buf_len); iInitDocument(file_in, (int) buf_len);
const char* author = szGetAuthor(); const char *author = szGetAuthor();
if (author != NULL) { if (author != NULL) {
APPEND_UTF8_META(doc, MetaAuthor, author) APPEND_UTF8_META(doc, MetaAuthor, author)
} }
const char* title = szGetTitle(); const char *title = szGetTitle();
if (title != NULL) { if (title != NULL) {
APPEND_UTF8_META(doc, MetaTitle, title) APPEND_UTF8_META(doc, MetaTitle, title)
} }
vFreeDocument(); vFreeDocument();
bWordDecryptor(file_in, buf_len, diag); bWordDecryptor(file_in, (int) buf_len, diag);
vDestroyDiagram(diag); vDestroyDiagram(diag);
fclose(file_out); fclose(file_out);
@@ -71,7 +71,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
free(out_buf); free(out_buf);
} }
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* buf, size_t buf_len) { void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {
scan_ebook_ctx_t ebook_ctx = { scan_ebook_ctx_t ebook_ctx = {
.content_size = ctx->content_size, .content_size = ctx->content_size,
@@ -93,7 +93,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
opts->iPageWidth = 595; opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3; opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file, buf_len); int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) { if (doc_word_version < 0 || doc_word_version == 3) {
free(buf); free(buf);
return; return;
@@ -110,7 +110,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
return; return;
} }
bWordDecryptor(file, buf_len, diag); bWordDecryptor(file, (int) buf_len, diag);
vDestroyDiagram(diag); vDestroyDiagram(diag);
fclose(file_out); fclose(file_out);

View File

@@ -41,13 +41,11 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
if (err->level == XML_ERR_FATAL) { if (err->level == XML_ERR_FATAL) {
CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message) CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
return -1; return -1;
} else {
CTX_LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message)
} }
} }
for (xmlNode *child = node; child; child = child->next) { for (xmlNode *child = node; child; child = child->next) {
if (*child->name == 't' && *(child->name + 1) == '\0') { if (child->name != NULL && *child->name == 't' && *(child->name + 1) == '\0') {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1); xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text) { if (text) {
@@ -70,7 +68,7 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
int xml_io_read(void *context, char *buffer, int len) { int xml_io_read(void *context, char *buffer, int len) {
struct archive *a = context; struct archive *a = context;
return archive_read_data(a, buffer, len); return (int) archive_read_data(a, buffer, len);
} }
int xml_io_close(UNUSED(void *context)) { int xml_io_close(UNUSED(void *context)) {
@@ -78,7 +76,7 @@ int xml_io_close(UNUSED(void *context)) {
return 0; return 0;
} }
#define READ_PART_ERR -2 #define READ_PART_ERR (-2)
__always_inline __always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) { static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
@@ -104,6 +102,42 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
return ret; return ret;
} }
/**
 * Read the current archive entry as XML and extract the page count from it.
 *
 * The entry is streamed from the archive through xml_io_read(). When the root
 * element is named "Properties", each direct child named "Pages" that has text
 * content is converted with strtol() and appended to the document as
 * MetaPages. Any other root element is ignored.
 *
 * @param ctx scan context (used by the logging macros)
 * @param a   archive positioned on the entry to read
 * @param doc document receiving the MetaPages entry
 * @return -1 when the XML cannot be parsed or has no root element, 0 otherwise
 */
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
    const int parse_flags = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_flags);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

    if (!xmlStrEqual(root->name, _X("Properties"))) {
        // Unexpected root element: not an error, there is just nothing to read.
        xmlFreeDoc(xml);
        return 0;
    }

    xmlNode *node = root->children;
    while (node != NULL) {
        xmlChar *value = xmlNodeListGetString(xml, node->xmlChildrenNode, 1);

        if (value != NULL) {
            if (xmlStrEqual(node->name, _X("Pages"))) {
                APPEND_LONG_META(doc, MetaPages, strtol((char *) value, NULL, 10))
            }
            xmlFree(value);
        }

        node = node->next;
    }

    xmlFreeDoc(xml);
    return 0;
}
__always_inline __always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) { static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
@@ -144,7 +178,7 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *
return 0; return 0;
} }
#define MAX_TN_SIZE 1024 * 1024 * 15 #define MAX_TN_SIZE (1024 * 1024 * 15)
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) { void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
size_t entry_size = archive_entry_size(entry); size_t entry_size = archive_entry_size(entry);
@@ -153,7 +187,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
return; return;
} }
char* buf = malloc(entry_size); char *buf = malloc(entry_size);
archive_read_data(a, buf, entry_size); archive_read_data(a, buf, entry_size);
APPEND_TN_META(doc, 1, 1) // Size unknown APPEND_TN_META(doc, 1, 1) // Size unknown
@@ -196,6 +230,10 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
} else if (ret == TEXT_BUF_FULL) { } else if (ret == TEXT_BUF_FULL) {
buffer_full = TRUE; buffer_full = TRUE;
} }
} else if (strcmp(path, "docProps/app.xml") == 0) {
if (read_doc_props_app(ctx, a, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/core.xml") == 0) { } else if (strcmp(path, "docProps/core.xml") == 0) {
if (read_doc_props(ctx, a, doc) != 0) { if (read_doc_props(ctx, a, doc) != 0) {
break; break;

View File

@@ -8,7 +8,7 @@
#define MIN_SIZE 32 #define MIN_SIZE 32
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) { int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
return store_image_thumbnail((scan_media_ctx_t*)ctx, img->data, img->data_size, doc, "x.jpeg"); return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
} }
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) { int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
@@ -36,7 +36,7 @@ int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, do
AVFrame *scaled_frame = av_frame_alloc(); AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx= sws_getContext( struct SwsContext *sws_ctx = sws_getContext(
img->width, img->height, AV_PIX_FMT_RGB24, img->width, img->height, AV_PIX_FMT_RGB24,
dstW, dstH, AV_PIX_FMT_YUVJ420P, dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0 SIST_SWS_ALGO, 0, 0, 0
@@ -80,6 +80,8 @@ int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, do
return TRUE; return TRUE;
} }
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) { void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
libraw_data_t *libraw_lib = libraw_init(0); libraw_data_t *libraw_lib = libraw_init(0);
@@ -99,6 +101,7 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (ret != 0) { if (ret != 0) {
CTX_LOG_ERROR(f->filepath, "Could not open raw file") CTX_LOG_ERROR(f->filepath, "Could not open raw file")
free(buf); free(buf);
libraw_close(libraw_lib);
return; return;
} }
@@ -111,8 +114,8 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (*libraw_lib->idata.software != '\0') { if (*libraw_lib->idata.software != '\0') {
APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software) APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
} }
APPEND_INT_META(doc, MetaWidth, libraw_lib->sizes.width) APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
APPEND_INT_META(doc, MetaHeight, libraw_lib->sizes.height) APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)
char tmp[1024]; char tmp[1024];
snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed); snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp) APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)
@@ -134,10 +137,24 @@ void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture); snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
APPEND_STR_META(doc, MetaExifFNumber, tmp) APPEND_STR_META(doc, MetaExifFNumber, tmp)
int denominator = (int)roundf(1 / libraw_lib->other.shutter); int denominator = (int) roundf(1 / libraw_lib->other.shutter);
snprintf(tmp, sizeof(tmp), "1/%d", denominator); snprintf(tmp, sizeof(tmp), "1/%d", denominator);
APPEND_STR_META(doc, MetaExifExposureTime, tmp) APPEND_STR_META(doc, MetaExifExposureTime, tmp)
libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
double gps_longitude_dec =
(gps.longtitude[0] + gps.longtitude[1] / 60 + gps.longtitude[2] / 3600) * DMS_REF(gps.longref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
if (gps_longitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
}
double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
if (gps_latitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
}
APPEND_STR_META(doc, MetaMediaVideoCodec, "raw") APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")
if (ctx->tn_size <= 0) { if (ctx->tn_size <= 0) {

View File

@@ -8,32 +8,24 @@
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <openssl/md5.h> #include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h" #include "macros.h"
#define SIST_SWS_ALGO SWS_LANCZOS #define SIST_SWS_ALGO SWS_LANCZOS
#define META_INT_MASK 0x80
#define META_STR_MASK 0x40
#define META_LONG_MASK 0x20
#define UNUSED(x) __attribute__((__unused__)) x #define UNUSED(x) __attribute__((__unused__)) x
#define META_STR(id) ((unsigned) id) | ((unsigned) META_STR_MASK)
#define META_INT(id) ((unsigned) id) | ((unsigned) META_INT_MASK)
#define META_LONG(id) ((unsigned) id) | ((unsigned) META_LONG_MASK)
#define IS_META_INT(key) (key & META_INT_MASK) == META_INT_MASK
#define IS_META_LONG(key) (key & META_LONG_MASK) == META_LONG_MASK
#define IS_META_STR(key) (key & META_STR_MASK) == META_STR_MASK
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len); typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...); typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
typedef void (*log_callback_t)(const char *filepath, int level, char *str); typedef void (*log_callback_t)(const char *filepath, int level, char *str);
typedef int scan_code_t; typedef int scan_code_t;
#define SCAN_OK (scan_code_t) 0 #define SCAN_OK (scan_code_t) 0
#define SCAN_ERR_READ (scan_code_t) -1 #define SCAN_ERR_READ (scan_code_t) (-1)
#define SCAN_ERR_SKIP (scan_code_t) (-2)
#define LEVEL_DEBUG 0 #define LEVEL_DEBUG 0
#define LEVEL_INFO 1 #define LEVEL_INFO 1
@@ -56,35 +48,46 @@ typedef int scan_code_t;
#define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1); #define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1);
#define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1); #define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1);
// This is written to file as a 16-bit int!
enum metakey { enum metakey {
MetaContent = META_STR(1), // String
MetaWidth = META_INT(2), MetaContent = 1,
MetaHeight = META_INT(3), MetaMediaAudioCodec,
MetaMediaDuration = META_LONG(4), MetaMediaVideoCodec,
MetaMediaAudioCodec = META_STR(5), MetaArtist,
MetaMediaVideoCodec = META_STR(6), MetaAlbum,
MetaMediaBitrate = META_LONG(7), MetaAlbumArtist,
MetaArtist = META_STR(8), MetaGenre,
MetaAlbum = META_STR(9), MetaTitle,
MetaAlbumArtist = META_STR(10), MetaFontName,
MetaGenre = META_STR(11), MetaParent,
MetaTitle = META_STR(12), MetaExifMake,
MetaFontName = META_STR(13), MetaExifSoftware,
MetaParent = META_STR(14), MetaExifExposureTime,
MetaExifMake = META_STR(15), MetaExifFNumber,
MetaExifSoftware = META_STR(16), MetaExifFocalLength,
MetaExifExposureTime = META_STR(17), MetaExifUserComment,
MetaExifFNumber = META_STR(18), MetaExifModel,
MetaExifFocalLength = META_STR(19), MetaExifIsoSpeedRatings,
MetaExifUserComment = META_STR(20), MetaExifDateTime,
MetaExifModel = META_STR(21), MetaAuthor,
MetaExifIsoSpeedRatings = META_STR(22), MetaModifiedBy,
MetaExifDateTime = META_STR(23), MetaThumbnail,
MetaAuthor = META_STR(24), MetaChecksum,
MetaModifiedBy = META_STR(25),
MetaThumbnail = META_STR(26), // Number
MetaPages = META_INT(27), MetaWidth,
MetaHeight,
MetaMediaDuration,
MetaMediaBitrate,
MetaPages,
// ??
MetaExifGpsLongitudeDMS,
MetaExifGpsLongitudeRef,
MetaExifGpsLatitudeDMS,
MetaExifGpsLatitudeRef,
MetaExifGpsLatitudeDec,
MetaExifGpsLongitudeDec,
}; };
typedef struct meta_line { typedef struct meta_line {
@@ -92,8 +95,8 @@ typedef struct meta_line {
enum metakey key; enum metakey key;
union { union {
char str_val[0]; char str_val[0];
int int_val;
unsigned long long_val; unsigned long long_val;
double double_val;
}; };
} meta_line_t; } meta_line_t;
@@ -131,11 +134,20 @@ typedef struct vfile {
}; };
int is_fs_file; int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath; const char *filepath;
struct stat info; struct stat info;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
void *rewind_buffer;
int rewind_buffer_size;
int rewind_buffer_cursor;
read_func_t read; read_func_t read;
seek_func_t seek; read_func_t read_rewindable;
close_func_t close; close_func_t close;
reset_func_t reset; reset_func_t reset;
log_callback_t log; log_callback_t log;

View File

@@ -35,7 +35,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
return SCAN_OK; return SCAN_OK;
} }
#define MAX_MARKUP_SIZE 1024 * 1024 #define MAX_MARKUP_SIZE (1024 * 1024)
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) { scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

View File

@@ -9,11 +9,14 @@
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0) #define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
#define TEXT_BUF_FULL -1 #define TEXT_BUF_FULL (-1)
#define INITIAL_BUF_SIZE 1024 * 16 #define INITIAL_BUF_SIZE (1024 * 16)
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c)) #define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) ((c >= '\'' && c <= ';') || (c >= 'A' && c <= 'z') || (c > 127)) #define SHOULD_KEEP_CHAR(c) (\
((c) >= '\'' && (c) <= ';') || \
((c) >= 'A' && (c) <= 'z') || \
((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD))
typedef struct dyn_buffer { typedef struct dyn_buffer {
@@ -133,11 +136,11 @@ static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
buf->cur += sizeof(int); buf->cur += sizeof(int);
} }
static void dyn_buffer_write_short(dyn_buffer_t *buf, short s) { static void dyn_buffer_write_short(dyn_buffer_t *buf, uint16_t s) {
grow_buffer_small(buf); grow_buffer_small(buf);
*(short *) (buf->buf + buf->cur) = s; *(uint16_t *) (buf->buf + buf->cur) = s;
buf->cur += sizeof(short); buf->cur += sizeof(uint16_t);
} }
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) { static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
@@ -255,7 +258,7 @@ static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t
} }
utf8_int32_t c; utf8_int32_t c;
char tmp[16]; char tmp[16] = {0};
do { do {
ptr = (char *) utf8codepoint(ptr, &c); ptr = (char *) utf8codepoint(ptr, &c);
@@ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) {
return buf; return buf;
} }
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)

/**
 * Feed `size` bytes starting at `buf` into the SHA-1 context `ctx`.
 *
 * The input is never passed to SHA1_Update() directly: it is first copied
 * into a local scratch buffer, exactly as before.
 * NOTE(review): the reason for the intermediate copy is not visible here
 * (presumably `buf` may be memory that is unsafe to hash in place) - confirm.
 *
 * Inputs larger than STACK_BUFFER_SIZE are streamed through the scratch buffer
 * chunk by chunk. SHA-1 is a streaming digest, so the result is identical to a
 * single update, and there is no heap allocation that could fail (the previous
 * version memcpy'd into an unchecked malloc() result).
 *
 * @param ctx  SHA-1 context to update
 * @param buf  start of the data to hash
 * @param size number of bytes to hash; 0 is a no-op
 */
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
    unsigned char stack_buf[STACK_BUFFER_SIZE];
    // Explicit cast: this header is also compiled as C++ (see libwpd_c_api.h).
    const unsigned char *src = (const unsigned char *) buf;
    size_t remaining = size;

    while (remaining > 0) {
        size_t chunk = remaining < STACK_BUFFER_SIZE ? remaining : STACK_BUFFER_SIZE;

        memcpy(stack_buf, src, chunk);
        SHA1_Update(ctx, (const void *) stack_buf, chunk);

        src += chunk;
        remaining -= chunk;
    }
}
#endif #endif

View File

@@ -0,0 +1,200 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
class StringDocument : public WPXDocumentInterface {
private:
text_buffer_t *tex;
document_t *doc;
bool is_full;
public:
StringDocument(text_buffer_t *tex, document_t *doc) {
this->tex = tex;
this->doc = doc;
this->is_full = false;
}
void setDocumentMetaData(const WPXPropertyList &propList) override {
WPXPropertyList::Iter propIter(propList);
for (propIter.rewind(); propIter.next();) {
// TODO: Read metadata here ?!
}
}
void endDocument() override {
text_buffer_terminate_string(this->tex);
}
void closeParagraph() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSpan() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSection() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertTab() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertSpace() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertText(const WPXString &text) override {
if (!this->is_full) {
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertLineBreak() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
void closePageSpan() override { /* noop */ }
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
void closeHeader() override { /* noop */ }
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
void closeFooter() override { /* noop */ }
void
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
void
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void closeOrderedListLevel() override { /* noop */ }
void closeUnorderedListLevel() override { /* noop */ }
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void closeListElement() override { /* noop */ }
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
void closeFootnote() override { /* noop */ }
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
void closeEndnote() override { /* noop */ }
void openComment(const WPXPropertyList &propList) override { /* noop */ }
void closeComment() override { /* noop */ }
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
void closeTextBox() override { /* noop */ }
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
void closeTableRow() override { /* noop */ }
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTableCell() override { /* noop */ }
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTable() override { /* noop */ }
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
void closeFrame() override { /* noop */ }
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
void startDocument() override { /* noop */ };
};
/**
 * Wrap an in-memory buffer in a libwpd WPXStringStream.
 *
 * The returned handle is heap-allocated and must be released with
 * wpd_memory_stream_destroy().
 * NOTE(review): whether the stream copies `buf` is not visible here; keep the
 * buffer alive until the stream is destroyed - confirm.
 */
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
/**
 * Ask libwpd how confident it is that the stream holds a WordPerfect document.
 *
 * @param ptr handle obtained from wpd_memory_stream_create()
 * @return libwpd's WPDConfidence value converted to its C counterpart
 */
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    return static_cast<wpd_confidence_t>(WPDocument::isFileFormatSupported(input));
}
/**
 * Parse the WordPerfect stream and collect its text into `tex`.
 *
 * @param ptr handle obtained from wpd_memory_stream_create()
 * @param tex text buffer that receives the extracted text
 * @param doc document being parsed (forwarded to the StringDocument sink)
 * @return libwpd's WPDResult value converted to its C counterpart
 */
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    auto *input = static_cast<WPXStringStream *>(ptr);
    StringDocument sink(tex, doc);

    // No password is supplied (nullptr).
    const WPDResult outcome = WPDocument::parse(input, &sink, nullptr);
    return static_cast<wpd_result_t>(outcome);
}
/** Release a stream handle created by wpd_memory_stream_create(). */
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete static_cast<WPXStringStream *>(ptr);
}

View File

@@ -0,0 +1,50 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
// Opaque handle to a libwpd input stream; created by wpd_memory_stream_create()
// and released by wpd_memory_stream_destroy().
typedef void *wpd_stream_t;
// C-visible counterpart of libwpd's WPDConfidence. Values are produced by a
// plain cast from the C++ enum, so the enumerator order here has to stay in
// sync with libwpd's (NOTE(review): verify against the libwpd headers in use).
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
// C-visible counterpart of libwpd's WPDResult, as returned by wpd_parse().
// Values are produced by a plain cast from the C++ enum, so the enumerator
// order has to stay in sync with libwpd's (NOTE(review): verify against the
// libwpd headers in use).
typedef enum {
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
// Returns libwpd's confidence that `stream` contains a supported WordPerfect document.
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
// Creates a stream over `buf_len` bytes at `buf`; release it with wpd_memory_stream_destroy().
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
// Destroys a stream created by wpd_memory_stream_create().
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
// Parses the stream and appends the extracted text to `tex`.
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

41
libscan/wpd/wpd.c Normal file
View File

@@ -0,0 +1,41 @@
#include "wpd.h"
#include "libwpd_c_api.h"
/**
 * Extract the text of a WordPerfect document into the MetaContent field.
 *
 * The whole file is read into memory and handed to libwpd through the C API
 * in libwpd_c_api.h.
 *
 * @param ctx scan context (provides the log callbacks)
 * @param f   file to read
 * @param doc document that receives the MetaContent meta line
 * @return SCAN_ERR_READ when the file cannot be read, is encrypted, or is not
 *         recognised as WordPerfect; SCAN_OK otherwise. A libwpd parse error is
 *         logged, but any text collected so far is still kept.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        // NOTE(review): guards against read_all() signalling failure with NULL;
        // harmless if it never does.
        CTX_LOG_ERRORF("wpd.c", "Could not read file [%s]", doc->filepath)
        return SCAN_ERR_READ;
    }

    void *stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
        wpd_memory_stream_destroy(stream);
        free(buf);
        return SCAN_ERR_READ;
    }

    // NOTE(review): ctx->content_size is not applied here; the buffer is
    // created with -1 - confirm this is intended.
    text_buffer_t tex = text_buffer_create(-1);
    wpd_result_t res = wpd_parse(stream, &tex, doc);

    if (res != C_WPD_OK) {
        CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                       doc->filepath, res)
    }

    // Best effort: keep whatever text was extracted, even after a parse error.
    if (tex.dyn_buffer.cur != 0) {
        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    }

    text_buffer_destroy(&tex);
    wpd_memory_stream_destroy(stream);
    free(buf);

    // Previously control fell off the end of this non-void function.
    return SCAN_OK;
}

23
libscan/wpd/wpd.h Normal file
View File

@@ -0,0 +1,23 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
// Context for the WordPerfect (.wpd) parser.
typedef struct {
// Content size setting. NOTE(review): presumably the maximum number of
// characters to extract; verify against the parser implementation.
long content_size;
// Logging callbacks used by the CTX_LOG_* macros.
log_callback_t log;
logf_callback_t logf;
// Mime id that is_wpd() compares against.
unsigned int wpd_mime;
} scan_wpd_ctx_t;
// Extracts the text content of a WordPerfect file `f` into `doc`.
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
/** Returns non-zero when `mime` is the WordPerfect mime id configured in `ctx`. */
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected_mime = ctx->wpd_mime;
    return expected_mime == mime;
}
#endif

View File

@@ -11,16 +11,20 @@ extern "C" {
#include "../libscan/mobi/scan_mobi.h" #include "../libscan/mobi/scan_mobi.h"
#include "../libscan/raw/raw.h" #include "../libscan/raw/raw.h"
#include "../libscan/msdoc/msdoc.h" #include "../libscan/msdoc/msdoc.h"
#include "../libscan/wpd/wpd.h"
#include "../libscan/json/json.h"
#include <libavutil/avutil.h> #include <libavutil/avutil.h>
} }
static scan_arc_ctx_t arc_recurse_media_ctx; static scan_arc_ctx_t arc_recurse_media_ctx;
static scan_arc_ctx_t arc_list_ctx; static scan_arc_ctx_t arc_list_ctx;
static scan_arc_ctx_t arc_recurse_ooxml_ctx;
static scan_text_ctx_t text_500_ctx; static scan_text_ctx_t text_500_ctx;
static scan_ebook_ctx_t ebook_ctx; static scan_ebook_ctx_t ebook_ctx;
static scan_ebook_ctx_t ebook_500_ctx; static scan_ebook_ctx_t ebook_500_ctx;
static scan_ebook_ctx_t ebook_fast_ctx;
static scan_comic_ctx_t comic_ctx; static scan_comic_ctx_t comic_ctx;
@@ -38,11 +42,20 @@ static scan_msdoc_ctx_t msdoc_ctx;
static scan_msdoc_ctx_t msdoc_text_ctx; static scan_msdoc_ctx_t msdoc_text_ctx;
static scan_wpd_ctx_t wpd_ctx;
document_t LastSubDoc; static scan_json_ctx_t json_ctx;
static document_t LastSubDoc;
static char *RecurseMediaMime = (char *) "";
void _parse_media(parse_job_t *job) { void _parse_media(parse_job_t *job) {
parse_media(&media_ctx, &job->vfile, &LastSubDoc); parse_media(&media_ctx, &job->vfile, &LastSubDoc, RecurseMediaMime);
}
void _parse_ooxml(parse_job_t *job) {
parse_ooxml(&ooxml_500_ctx, &job->vfile, &LastSubDoc);
} }
@@ -202,7 +215,7 @@ TEST(Ebook, CandlePdf) {
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' '); ASSERT_NE(get_meta(&doc, MetaContent)->str_val[0], ' ');
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 16); ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 16);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
@@ -218,6 +231,24 @@ TEST(Ebook, Utf8Pdf) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Checks that invalid / non-printable characters in a PDF's text are filtered
// out of the extracted MetaContent.
TEST(Ebook, Utf8PdfInvalidChars) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc);
// tesseract_lang is cleared for this parse only and set back to "eng" right after.
ebook_ctx.tesseract_lang = nullptr;
parse_ebook(&ebook_ctx, &f, "application/pdf", &doc);
ebook_ctx.tesseract_lang = "eng";
// The text should read "HART is a group of highly qualified ..." but the PDF's
// text has been intentionally scrambled by its authors.
// We can at least filter out non-printable/invalid characters (such as the
// U+FFFD replacement character).
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
cleanup(&doc, &f);
}
TEST(Ebook, Pdf2) { TEST(Ebook, Pdf2) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -250,6 +281,28 @@ TEST(Ebook, Epub1) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// The fast epub path must still produce ~500 characters of content for the
// file from mupdf issue 129.
TEST(Ebook, EpubFastMupdfError) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/ebook/mupdf-issue-129.epub", &file, &document);

    parse_ebook(&ebook_fast_ctx, &file, "application/epub+zip", &document);

    const char *content = get_meta(&document, MetaContent)->str_val;
    ASSERT_NEAR(strlen(content), 500, 4);

    cleanup(&document, &file);
}
// The fast epub path yields ~500 characters of content for epub1.epub.
TEST(Ebook, Epub1Fast) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/ebook/epub1.epub", &file, &document);

    parse_ebook(&ebook_fast_ctx, &file, "application/epub+zip", &document);

    const char *content = get_meta(&document, MetaContent)->str_val;
    ASSERT_NEAR(strlen(content), 500, 4);

    cleanup(&document, &file);
}
TEST(Ebook, EpubBlankFirstPage) { TEST(Ebook, EpubBlankFirstPage) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -291,6 +344,19 @@ TEST(Comic, ComicCbr) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Regression test for issue 160: parsing this archive with thumbnails disabled
// must not crash. There is nothing to assert beyond that.
TEST(Comic, ComicIssue160) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/ebook/comic-segfault-issue-160.cbr", &file, &document);

    // Run with tn_size = 0, then put the shared context back as it was.
    const int original_tn_size = comic_ctx.tn_size;
    comic_ctx.tn_size = 0;
    parse_comic(&comic_ctx, &file, &document);
    comic_ctx.tn_size = original_tn_size;

    cleanup(&document, &file);
}
TEST(Comic, ComicCbrAsIs) { TEST(Comic, ComicCbrAsIs) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -319,12 +385,28 @@ TEST(Comic, ComicCbrFilters) {
/* Media (image) */ /* Media (image) */
// EXIF GPS reference and DMS fields are extracted from a JPEG.
TEST(MediaImage, ExifGps1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/exif_GPS.jpg", &file, &document);

    parse_media(&media_ctx, &file, &document, "image/jpeg");

    // Latitude
    const char *lat_ref = get_meta(&document, MetaExifGpsLatitudeRef)->str_val;
    ASSERT_STREQ(lat_ref, "N");
    const char *lat_dms = get_meta(&document, MetaExifGpsLatitudeDMS)->str_val;
    ASSERT_STREQ(lat_dms, "48:1 , 56585399:1000000, 0:1");

    // Longitude
    const char *lon_ref = get_meta(&document, MetaExifGpsLongitudeRef)->str_val;
    ASSERT_STREQ(lon_ref, "E");
    const char *lon_dms = get_meta(&document, MetaExifGpsLongitudeDMS)->str_val;
    ASSERT_STREQ(lon_dms, "9:1 , 28046900:1000000, 0:1");

    cleanup(&document, &file);
}
TEST(MediaImage, Exif1) { TEST(MediaImage, Exif1) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc); load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "image/jpeg");
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end " ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end "
"hotels or what, but I've seen it in a few places in Thailand: " "hotels or what, but I've seen it in a few places in Thailand: "
@@ -353,13 +435,28 @@ TEST(MediaImage, Mem1) {
size_t size_before = store_size; size_t size_before = store_size;
parse_archive(&arc_recurse_media_ctx, &f, &doc); RecurseMediaMime = (char *) "image/jpeg";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Parsing this JPEG from the filesystem grows the store by exactly 14098 bytes.
TEST(MediaImage, AsIsFs) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/9555.jpg", &file, &document);

    const size_t store_size_before = store_size;
    parse_media(&media_ctx, &file, &document, "image/jpeg");

    ASSERT_EQ(store_size_before + 14098, store_size);

    cleanup(&document, &file);
}
TEST(MediaImage, Mem2AsIs) { TEST(MediaImage, Mem2AsIs) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -367,19 +464,50 @@ TEST(MediaImage, Mem2AsIs) {
size_t size_before = store_size; size_t size_before = store_size;
parse_archive(&arc_recurse_media_ctx, &f, &doc); RecurseMediaMime = (char *) "image/jpeg";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
ASSERT_EQ(size_before + 14098, store_size); ASSERT_EQ(size_before + 14098, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Without touching read_subtitles, something is written to the store but no
// MetaContent is produced.
TEST(MediaVideo, VidMkvSubDisabled) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/berd.mkv", &file, &document);

    const size_t store_size_before = store_size;
    parse_media(&media_ctx, &file, &document, "video/x-matroska");

    ASSERT_NE(store_size_before, store_size);
    ASSERT_EQ(get_meta(&document, MetaContent), nullptr);

    cleanup(&document, &file);
}
// With read_subtitles enabled, MetaContent is produced.
TEST(MediaVideo, VidMkvSubEnabled) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/berd.mkv", &file, &document);

    const size_t store_size_before = store_size;

    // Enable subtitles for this parse only; the shared context is reset before
    // any assertion runs.
    media_ctx.read_subtitles = TRUE;
    parse_media(&media_ctx, &file, &document, "video/x-matroska");
    media_ctx.read_subtitles = FALSE;

    ASSERT_NE(store_size_before, store_size);
    ASSERT_NE(get_meta(&document, MetaContent), nullptr);

    cleanup(&document, &file);
}
TEST(MediaVideo, Vid3Mp4) { TEST(MediaVideo, Vid3Mp4) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc); load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "video/mp4");
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - " ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - "
"https://archive.org/details/Virginia_Helicopter_Crash"); "https://archive.org/details/Virginia_Helicopter_Crash");
@@ -396,7 +524,7 @@ TEST(MediaVideo, Vid3Ogv) {
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc); load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "application/ogg");
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261); ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261);
@@ -411,7 +539,7 @@ TEST(MediaVideo, Vid3Webm) {
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc); load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "video/webm");
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8");
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153); ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153);
@@ -428,7 +556,8 @@ TEST(MediaVideoVfile, Vid3Ogv) {
size_t size_before = store_size; size_t size_before = store_size;
parse_archive(&arc_recurse_media_ctx, &f, &doc); RecurseMediaMime = (char *) "video/webm";
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
// ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora"); // ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora");
ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261); ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261);
@@ -443,7 +572,7 @@ TEST(MediaVideo, VidDuplicateTags) {
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc); load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "video/x-matroska");
meta_line_t *meta_content = get_meta(&doc, MetaContent); meta_line_t *meta_content = get_meta(&doc, MetaContent);
ASSERT_STREQ(meta_content->str_val, "he's got a point"); ASSERT_STREQ(meta_content->str_val, "he's got a point");
@@ -467,7 +596,7 @@ TEST(MediaAudio, MusicMp3) {
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc); load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc);
parse_media(&media_ctx, &f, &doc); parse_media(&media_ctx, &f, &doc, "audio/x-mpeg-3");
ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James"); ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James");
ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams"); ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams");
@@ -505,11 +634,48 @@ TEST(Ooxml, Docx1) {
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas"); ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Full-content extraction of docx2.docx: author, page count and exact content length.
TEST(Ooxml, Docx2) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx", &f, &doc);

    // Lift the content limit for this parse only. The shared context is restored
    // BEFORE the assertions: a failing ASSERT_* returns from the test body
    // immediately, which would otherwise leave content_size at 999999 for every
    // later test that uses ooxml_500_ctx.
    ooxml_500_ctx.content_size = 999999;
    parse_ooxml(&ooxml_500_ctx, &f, &doc);
    ooxml_500_ctx.content_size = 500;

    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 2780);

    cleanup(&doc, &f);
}
// Same document as Ooxml.Docx2, but reached through an archive: the metadata is
// read from LastSubDoc.
TEST(Ooxml, Docx2Archive) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc);

    // Lift the content limit for this parse only. The shared context is restored
    // BEFORE the assertions: a failing ASSERT_* returns from the test body
    // immediately, which would otherwise leave content_size at 999999 for every
    // later test that uses ooxml_500_ctx.
    ooxml_500_ctx.content_size = 999999;
    parse_archive(&arc_recurse_ooxml_ctx, &f, &doc, nullptr, nullptr);
    ooxml_500_ctx.content_size = 500;

    ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&LastSubDoc, MetaContent)->str_val), 2780);

    // The leftover debug dump of the whole content to stderr was removed; the
    // assertions above already report mismatches.

    cleanup(&doc, &f);
}
TEST(Ooxml, Docx2Thumbnail) { TEST(Ooxml, Docx2Thumbnail) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -520,6 +686,7 @@ TEST(Ooxml, Docx2Thumbnail) {
parse_ooxml(&ooxml_500_ctx, &f, &doc); parse_ooxml(&ooxml_500_ctx, &f, &doc);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
@@ -588,7 +755,7 @@ TEST(Arc, Utf8) {
document_t doc; document_t doc;
load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc); load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc);
parse_archive(&arc_list_ctx, &f, &doc); parse_archive(&arc_list_ctx, &f, &doc, nullptr, nullptr);
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr);
@@ -603,7 +770,7 @@ TEST(Arc, EncryptedZip) {
size_t size_before = store_size; size_t size_before = store_size;
strcpy(arc_recurse_media_ctx.passphrase, "sist2"); strcpy(arc_recurse_media_ctx.passphrase, "sist2");
parse_archive(&arc_recurse_media_ctx, &f, &doc); parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
arc_recurse_media_ctx.passphrase[0] = '\0'; arc_recurse_media_ctx.passphrase[0] = '\0';
@@ -629,13 +796,30 @@ TEST(RAW, Panasonic) {
ASSERT_STREQ(get_meta(&doc, MetaExifDateTime)->str_val, "2020:07:20 10:00:34"); ASSERT_STREQ(get_meta(&doc, MetaExifDateTime)->str_val, "2020:07:20 10:00:34");
ASSERT_STREQ(get_meta(&doc, MetaExifFocalLength)->str_val, "20.0"); ASSERT_STREQ(get_meta(&doc, MetaExifFocalLength)->str_val, "20.0");
ASSERT_STREQ(get_meta(&doc, MetaExifFNumber)->str_val, "2.0"); ASSERT_STREQ(get_meta(&doc, MetaExifFNumber)->str_val, "2.0");
ASSERT_EQ(get_meta(&doc, MetaWidth)->int_val, 5200); ASSERT_EQ(get_meta(&doc, MetaWidth)->long_val, 5200);
ASSERT_EQ(get_meta(&doc, MetaHeight)->int_val, 3904); ASSERT_EQ(get_meta(&doc, MetaHeight)->long_val, 3904);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Decimal GPS coordinates are extracted from a RAW (DNG) file.
TEST(RAW, ExifGps1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/raw/exif_gps.DNG", &file, &document);

    const size_t store_size_before = store_size;
    parse_raw(&raw_ctx, &file, &document);

    ASSERT_NE(store_size_before, store_size);

    const char *latitude = get_meta(&document, MetaExifGpsLatitudeDec)->str_val;
    ASSERT_STREQ(latitude, "48.943088531494141");

    const char *longitude = get_meta(&document, MetaExifGpsLongitudeDec)->str_val;
    ASSERT_STREQ(longitude, "9.467448234558105");

    cleanup(&document, &file);
}
TEST(RAW, Nikon) { TEST(RAW, Nikon) {
vfile_t f; vfile_t f;
document_t doc; document_t doc;
@@ -648,8 +832,8 @@ TEST(RAW, Nikon) {
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw");
ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "D750"); ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "D750");
ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Nikon"); ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Nikon");
ASSERT_EQ(get_meta(&doc, MetaWidth)->int_val, 6032); ASSERT_EQ(get_meta(&doc, MetaWidth)->long_val, 6032);
ASSERT_EQ(get_meta(&doc, MetaHeight)->int_val, 4032); ASSERT_EQ(get_meta(&doc, MetaHeight)->long_val, 4032);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
@@ -667,8 +851,8 @@ TEST(RAW, Sony) {
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw");
ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "ILCE-7RM3"); ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "ILCE-7RM3");
ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Sony"); ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Sony");
ASSERT_EQ(get_meta(&doc, MetaWidth)->int_val, 7968); ASSERT_EQ(get_meta(&doc, MetaWidth)->long_val, 7968);
ASSERT_EQ(get_meta(&doc, MetaHeight)->int_val, 5320); ASSERT_EQ(get_meta(&doc, MetaHeight)->long_val, 5320);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
@@ -686,8 +870,8 @@ TEST(RAW, Olympus) {
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw");
ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "E-M5MarkII"); ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "E-M5MarkII");
ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Olympus"); ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Olympus");
ASSERT_EQ(get_meta(&doc, MetaWidth)->int_val, 4640); ASSERT_EQ(get_meta(&doc, MetaWidth)->long_val, 4640);
ASSERT_EQ(get_meta(&doc, MetaHeight)->int_val, 3472); ASSERT_EQ(get_meta(&doc, MetaHeight)->long_val, 3472);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
@@ -704,8 +888,8 @@ TEST(RAW, Fuji) {
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw"); ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "raw");
ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "X-T2"); ASSERT_STREQ(get_meta(&doc, MetaExifModel)->str_val, "X-T2");
ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Fujifilm"); ASSERT_STREQ(get_meta(&doc, MetaExifMake)->str_val, "Fujifilm");
ASSERT_EQ(get_meta(&doc, MetaWidth)->int_val, 6032); ASSERT_EQ(get_meta(&doc, MetaWidth)->long_val, 6032);
ASSERT_EQ(get_meta(&doc, MetaHeight)->int_val, 4028); ASSERT_EQ(get_meta(&doc, MetaHeight)->long_val, 4028);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
cleanup(&doc, &f); cleanup(&doc, &f);
@@ -724,7 +908,7 @@ TEST(Msdoc, Test1Pdf) {
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr); ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr);
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION"); ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan"); ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
ASSERT_EQ(get_meta(&doc, MetaPages)->int_val, 57); ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 57);
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4); ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
ASSERT_NE(size_before, store_size); ASSERT_NE(size_before, store_size);
@@ -841,7 +1025,7 @@ TEST(Msdoc, TestFuzz1) {
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 1000; i++) {
size_t buf_len_copy = buf_len; size_t buf_len_copy = buf_len;
char *buf_copy = (char*)malloc(buf_len); char *buf_copy = (char *) malloc(buf_len);
memcpy(buf_copy, buf, buf_len); memcpy(buf_copy, buf, buf_len);
fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5); fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);
@@ -852,6 +1036,38 @@ TEST(Msdoc, TestFuzz1) {
cleanup(&doc, &f); cleanup(&doc, &f);
} }
// Parses the wpd/test51_1.wpd fixture with parse_wpd() and checks that the
// extracted MetaContent string matches the expected text exactly.
TEST(Wpd, Wpd51_1) {
    vfile_t file;
    document_t document;

    load_doc_file("libscan-test-files/test_files/wpd/test51_1.wpd", &file, &document);

    parse_wpd(&wpd_ctx, &file, &document);

    ASSERT_STREQ(
            get_meta(&document, MetaContent)->str_val,
            "Hello, WordPerfect This is a test This is the next page This is another page"
    );

    cleanup(&document, &file);
}
// Smoke test: runs parse_json() on the json/json1.json fixture and then
// cleans up. There are no assertions on the parsed output; the test only
// passes if both calls return without crashing.
TEST(Json, Json1) {
    vfile_t file;
    document_t document;

    load_doc_file("libscan-test-files/test_files/json/json1.json", &file, &document);

    parse_json(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
// Smoke test: runs parse_ndjson() on the json/ndjson1.jsonl fixture and then
// cleans up. There are no assertions on the parsed output; the test only
// passes if both calls return without crashing.
TEST(Json, NDJson1) {
    vfile_t file;
    document_t document;

    load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &file, &document);

    parse_ndjson(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
setlocale(LC_ALL, ""); setlocale(LC_ALL, "");
@@ -862,6 +1078,12 @@ int main(int argc, char **argv) {
arc_recurse_media_ctx.mode = ARC_MODE_RECURSE; arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
arc_recurse_media_ctx.parse = _parse_media; arc_recurse_media_ctx.parse = _parse_media;
arc_recurse_ooxml_ctx.log = noop_log;
arc_recurse_ooxml_ctx.logf = noop_logf;
arc_recurse_ooxml_ctx.store = counter_store;
arc_recurse_ooxml_ctx.mode = ARC_MODE_RECURSE;
arc_recurse_ooxml_ctx.parse = _parse_ooxml;
arc_list_ctx.log = noop_log; arc_list_ctx.log = noop_log;
arc_list_ctx.logf = noop_logf; arc_list_ctx.logf = noop_logf;
arc_list_ctx.store = counter_store; arc_list_ctx.store = counter_store;
@@ -878,10 +1100,15 @@ int main(int argc, char **argv) {
ebook_ctx.tn_size = 500; ebook_ctx.tn_size = 500;
ebook_ctx.log = noop_log; ebook_ctx.log = noop_log;
ebook_ctx.logf = noop_logf; ebook_ctx.logf = noop_logf;
ebook_ctx.fast_epub_parse = 0;
ebook_ctx.tn_qscale = 1.0;
ebook_500_ctx = ebook_ctx; ebook_500_ctx = ebook_ctx;
ebook_500_ctx.content_size = 500; ebook_500_ctx.content_size = 500;
ebook_fast_ctx = ebook_500_ctx;
ebook_fast_ctx.fast_epub_parse = 1;
comic_ctx.tn_qscale = 1.0; comic_ctx.tn_qscale = 1.0;
comic_ctx.tn_size = 500; comic_ctx.tn_size = 500;
comic_ctx.log = noop_log; comic_ctx.log = noop_log;
@@ -928,6 +1155,14 @@ int main(int argc, char **argv) {
msdoc_text_ctx.content_size = 500; msdoc_text_ctx.content_size = 500;
msdoc_text_ctx.tn_size = 0; msdoc_text_ctx.tn_size = 0;
wpd_ctx.log = noop_log;
wpd_ctx.logf = noop_logf;
wpd_ctx.content_size = 500;
json_ctx.log = noop_log;
json_ctx.logf = noop_logf;
json_ctx.content_size = 5000;
av_log_set_level(AV_LOG_QUIET); av_log_set_level(AV_LOG_QUIET);
::testing::InitGoogleTest(&argc, argv); ::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();

View File

@@ -16,7 +16,7 @@ int fs_read(struct vfile *f, void *buf, size_t size) {
} }
} }
return read(f->fd, buf, size); return (int) read(f->fd, buf, size);
} }
//Note: No out of bounds check //Note: No out of bounds check
@@ -61,12 +61,14 @@ void load_file(const char *filepath, vfile_t *f) {
f->read = fs_read; f->read = fs_read;
f->close = fs_close; f->close = fs_close;
f->is_fs_file = TRUE; f->is_fs_file = TRUE;
f->calculate_checksum = TRUE;
f->has_checksum = FALSE;
} }
void load_mem(void *mem, size_t size, vfile_t *f) { void load_mem(void *mem, size_t size, vfile_t *f) {
f->filepath = "_mem_"; f->filepath = "_mem_";
f->_test_data = mem; f->_test_data = mem;
f->info.st_size = size; f->info.st_size = (int) size;
f->read = mem_read; f->read = mem_read;
f->close = nullptr; f->close = nullptr;
f->is_fs_file = TRUE; f->is_fs_file = TRUE;
@@ -106,7 +108,7 @@ void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
} }
for (int disp = 0; disp < width; disp++) { for (int disp = 0; disp < width; disp++) {
buf[offset + disp] = (int8_t)rand(); buf[offset + disp] = (int8_t) rand();
} }
} }
} }