mirror of
https://github.com/simon987/libscan.git
synced 2025-12-20 09:35:57 +00:00
Compare commits
9 Commits
22522d7d4a
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 3787475ecb | |||
| 1d95be534b | |||
| da17282374 | |||
| 52d7649322 | |||
| 23da8ada5f | |||
| f061212d4b | |||
| fe53e1a219 | |||
| 75ff57fd94 | |||
| 1dad8fae20 |
@@ -25,6 +25,8 @@ add_library(
|
||||
libscan/media/media.c libscan/media/media.h
|
||||
libscan/font/font.c libscan/font/font.h
|
||||
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
|
||||
libscan/json/json.c libscan/json/json.h
|
||||
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
|
||||
|
||||
third-party/utf8.h
|
||||
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
|
||||
@@ -32,6 +34,7 @@ set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
|
||||
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
|
||||
|
||||
find_package(cJSON CONFIG REQUIRED)
|
||||
find_package(LibArchive REQUIRED)
|
||||
find_package(BZip2 REQUIRED)
|
||||
find_package(lz4 REQUIRED)
|
||||
@@ -43,6 +46,8 @@ find_package(JPEG REQUIRED)
|
||||
find_package(LibXml2 REQUIRED)
|
||||
find_package(LibLZMA REQUIRED)
|
||||
find_package(ZLIB REQUIRED)
|
||||
find_package(unofficial-pcre CONFIG REQUIRED)
|
||||
|
||||
|
||||
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
|
||||
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
|
||||
@@ -117,37 +122,38 @@ ExternalProject_Add(
|
||||
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
||||
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
|
||||
|
||||
#ExternalProject_Add(
|
||||
# libwpd
|
||||
# URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
|
||||
#
|
||||
# UPDATE_COMMAND ""
|
||||
# PATCH_COMMAND ""
|
||||
# TEST_COMMAND ""
|
||||
# CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
|
||||
# INSTALL_COMMAND ""
|
||||
#
|
||||
# PREFIX "third-party/ext_libwpd"
|
||||
# SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
|
||||
# BINARY_DIR "third-party/ext_libwpd/src/libwpd"
|
||||
#
|
||||
# BUILD_COMMAND ${MAKE_EXE} -j33
|
||||
#)
|
||||
#SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
|
||||
#SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
|
||||
ExternalProject_Add(
|
||||
libwpd
|
||||
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
|
||||
|
||||
UPDATE_COMMAND ""
|
||||
PATCH_COMMAND ""
|
||||
TEST_COMMAND ""
|
||||
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
|
||||
INSTALL_COMMAND ""
|
||||
|
||||
PREFIX "third-party/ext_libwpd"
|
||||
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
|
||||
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
|
||||
|
||||
BUILD_COMMAND ${MAKE_EXE} -j33
|
||||
)
|
||||
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
|
||||
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
|
||||
|
||||
add_dependencies(
|
||||
scan
|
||||
libmobi
|
||||
ffmpeg
|
||||
antiword
|
||||
# libwpd
|
||||
libwpd
|
||||
)
|
||||
|
||||
target_link_libraries(
|
||||
scan
|
||||
PUBLIC
|
||||
|
||||
cjson
|
||||
${LibArchive_LIBRARIES}
|
||||
ZLIB::ZLIB
|
||||
BZip2::BZip2
|
||||
@@ -160,7 +166,8 @@ target_link_libraries(
|
||||
|
||||
${MOBI_LIB_DIR}/libmobi.a
|
||||
|
||||
# ${WPD_LIB_DIR}/libwpd-0.9.a
|
||||
${WPD_LIB_DIR}/libwpd-0.9.a
|
||||
${WPD_LIB_DIR}/libwpd-stream-0.9.a
|
||||
|
||||
${FREETYPE_LIB}
|
||||
${HARFBUZZ_LIB}
|
||||
@@ -195,6 +202,7 @@ target_link_libraries(
|
||||
${GUMBO_LIB}
|
||||
dl
|
||||
antiword
|
||||
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
@@ -205,7 +213,7 @@ target_include_directories(
|
||||
${LIBXML2_INCLUDE_DIR}
|
||||
${FFMPEG_INCLUDE_DIR}
|
||||
${MOBI_INCLUDE_DIR}
|
||||
# ${WPD_INCLUDE_DIR}
|
||||
${WPD_INCLUDE_DIR}
|
||||
)
|
||||
|
||||
if (BUILD_TESTS)
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <openssl/evp.h>
|
||||
#include <pcre.h>
|
||||
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
@@ -33,18 +35,81 @@ int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
size_t read = archive_read_data(f->arc, buf, size);
|
||||
void arc_close(struct vfile *f) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
|
||||
if (read != size) {
|
||||
const char* error_str = archive_error_string(f->arc);
|
||||
if (f->rewind_buffer != NULL) {
|
||||
free(f->rewind_buffer);
|
||||
f->rewind_buffer = NULL;
|
||||
f->rewind_buffer_size = 0;
|
||||
f->rewind_buffer_cursor = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
int bytes_copied = 0;
|
||||
|
||||
if (f->rewind_buffer_size != 0) {
|
||||
if (size > f->rewind_buffer_size) {
|
||||
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);
|
||||
|
||||
bytes_copied = f->rewind_buffer_size;
|
||||
size -= f->rewind_buffer_size;
|
||||
buf += f->rewind_buffer_size;
|
||||
f->rewind_buffer_size = 0;
|
||||
} else {
|
||||
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
|
||||
f->rewind_buffer_size -= (int) size;
|
||||
f->rewind_buffer_cursor += (int) size;
|
||||
|
||||
return (int) size;
|
||||
}
|
||||
}
|
||||
|
||||
size_t bytes_read = archive_read_data(f->arc, buf, size);
|
||||
|
||||
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
|
||||
}
|
||||
|
||||
if (bytes_read != size && archive_errno(f->arc) != 0) {
|
||||
const char *error_str = archive_error_string(f->arc);
|
||||
if (error_str != NULL) {
|
||||
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
return read;
|
||||
return (int) bytes_read + bytes_copied;
|
||||
}
|
||||
|
||||
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
if (f->rewind_buffer != NULL) {
|
||||
fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
size_t bytes_read = archive_read_data(f->arc, buf, size);
|
||||
|
||||
if (bytes_read != size && archive_errno(f->arc) != 0) {
|
||||
const char *error_str = archive_error_string(f->arc);
|
||||
if (error_str != NULL) {
|
||||
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
f->rewind_buffer = malloc(size);
|
||||
f->rewind_buffer_size = (int) size;
|
||||
f->rewind_buffer_cursor = 0;
|
||||
memcpy(f->rewind_buffer, buf, size);
|
||||
|
||||
return (int) bytes_read;
|
||||
}
|
||||
|
||||
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
|
||||
@@ -58,7 +123,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
|
||||
archive_read_add_passphrase(*a, ctx->passphrase);
|
||||
}
|
||||
|
||||
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
|
||||
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
|
||||
} else if (allow_recurse) {
|
||||
*a = archive_read_new();
|
||||
archive_read_support_filter_all(*a);
|
||||
@@ -78,7 +143,10 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
|
||||
}
|
||||
}
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
static __thread int sub_strings[30];
|
||||
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
|
||||
|
||||
struct archive *a = NULL;
|
||||
struct archive_entry *entry = NULL;
|
||||
@@ -102,8 +170,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||
const char* utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
|
||||
|
||||
dyn_buffer_append_string(&buf, file_path);
|
||||
dyn_buffer_write_char(&buf, ' ');
|
||||
@@ -121,21 +189,26 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
|
||||
|
||||
sub_job->vfile.close = NULL;
|
||||
sub_job->vfile.close = arc_close;
|
||||
sub_job->vfile.read = arc_read;
|
||||
sub_job->vfile.read_rewindable = arc_read_rewindable;
|
||||
sub_job->vfile.reset = NULL;
|
||||
sub_job->vfile.arc = a;
|
||||
sub_job->vfile.filepath = sub_job->filepath;
|
||||
sub_job->vfile.is_fs_file = FALSE;
|
||||
sub_job->vfile.rewind_buffer_size = 0;
|
||||
sub_job->vfile.rewind_buffer = NULL;
|
||||
sub_job->vfile.log = ctx->log;
|
||||
sub_job->vfile.logf = ctx->logf;
|
||||
sub_job->vfile.has_checksum = FALSE;
|
||||
sub_job->vfile.calculate_checksum = f->calculate_checksum;
|
||||
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
sub_job->vfile.info = *archive_entry_stat(entry);
|
||||
if (S_ISREG(sub_job->vfile.info.st_mode)) {
|
||||
|
||||
const char* utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
|
||||
if (utf8_name == NULL) {
|
||||
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
|
||||
@@ -144,13 +217,21 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
}
|
||||
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
|
||||
|
||||
// Handle excludes
|
||||
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
|
||||
continue;
|
||||
}
|
||||
|
||||
char *p = strrchr(sub_job->filepath, '.');
|
||||
if (p != NULL) {
|
||||
if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
|
||||
sub_job->ext = (int) (p - sub_job->filepath + 1);
|
||||
} else {
|
||||
sub_job->ext = (int) strlen(sub_job->filepath);
|
||||
}
|
||||
|
||||
SHA1_Init(&sub_job->vfile.sha1_ctx);
|
||||
|
||||
ctx->parse(sub_job);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#include <fcntl.h>
|
||||
#include <pcre.h>
|
||||
#include "../scan.h"
|
||||
|
||||
# define ARC_SKIPPED -1
|
||||
# define ARC_SKIPPED (-1)
|
||||
#define ARC_MODE_SKIP 0
|
||||
#define ARC_MODE_LIST 1
|
||||
#define ARC_MODE_SHALLOW 2
|
||||
@@ -31,27 +32,34 @@ typedef struct {
|
||||
} arc_data_t;
|
||||
|
||||
static int vfile_open_callback(struct archive *a, void *user_data) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
if (data->f->is_fs_file && data->f->fd == -1) {
|
||||
data->f->fd = open(data->f->filepath, O_RDONLY);
|
||||
if (!data->f->is_fs_file) {
|
||||
SHA1_Init(&data->f->sha1_ctx);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
}
|
||||
|
||||
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
*buf = data->buf;
|
||||
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
|
||||
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
|
||||
|
||||
if (!data->f->is_fs_file && ret > 0) {
|
||||
data->f->has_checksum = TRUE;
|
||||
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vfile_close_callback(struct archive *a, void *user_data) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
if (data->f->close != NULL) {
|
||||
data->f->close(data->f);
|
||||
if (!data->f->is_fs_file) {
|
||||
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
@@ -61,8 +69,12 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext);
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
|
||||
|
||||
int arc_read(struct vfile * f, void *buf, size_t size);
|
||||
int arc_read(struct vfile *f, void *buf, size_t size);
|
||||
|
||||
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
|
||||
|
||||
void arc_close(struct vfile *f);
|
||||
|
||||
#endif
|
||||
|
||||
119
libscan/json/json.c
Normal file
119
libscan/json/json.c
Normal file
@@ -0,0 +1,119 @@
|
||||
#include "json.h"
|
||||
#include "cjson/cJSON.h"
|
||||
|
||||
|
||||
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
|
||||
|
||||
/*
 * Recursively walk a cJSON tree and append every string value, followed by a
 * single space, to `tex`.
 *
 * Returns TRUE as soon as one of the appends reports TEXT_BUF_FULL (the walk
 * stops there), FALSE otherwise. Nodes that are neither string, object nor
 * array contribute nothing.
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsString(json)) {
        if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
            return TRUE;
        }
        if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
            return TRUE;
        }
        return FALSE;
    }

    if (!cJSON_IsObject(json) && !cJSON_IsArray(json)) {
        return FALSE;
    }

    // Objects and arrays both keep their members in the `child` linked list
    cJSON *node = json->child;
    while (node != NULL) {
        if (json_extract_text(node, tex)) {
            return TRUE;
        }
        node = node->next;
    }

    return FALSE;
}
|
||||
|
||||
/**
 * Parse a whole JSON file and store the concatenation of all its string
 * values as the document's MetaContent.
 *
 * @return SCAN_ERR_SKIP when the file is larger than JSON_MAX_FILE_SIZE,
 *         SCAN_ERR_READ when the file cannot be read (or the read buffer
 *         cannot be grown), SCAN_OK otherwise.
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // cJSON needs a NUL-terminated string: grow by one byte without losing
    // the original allocation if realloc fails.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    // A failed parse yields NULL, which json_extract_text/cJSON_Delete accept
    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||
|
||||
#define JSON_BUF_SIZE (1024 * 1024 * 5)
|
||||
|
||||
/**
 * Parse a newline-delimited JSON file with a sliding window of JSON_BUF_SIZE
 * bytes: one JSON value is parsed from the start of the window, the consumed
 * bytes are shifted out and the freed tail is refilled from the file.
 * All string values found are stored as the document's MetaContent.
 *
 * Parsing stops at the first position where no progress can be made, or when
 * a single value fills the whole window.
 *
 * @return SCAN_ERR_READ if the window cannot be allocated, SCAN_OK otherwise.
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    // Zero-filled, with one extra byte that always stays '\0' as terminator
    char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        cJSON *json;

        if (!eof) {
            // First pass fills the whole window, later passes refill the
            // bytes consumed by the previous parse.
            to_read = parse_end == buf ? JSON_BUF_SIZE : (size_t) (parse_end - buf);
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
            }
        }

        json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        if (parse_end == buf) {
            cJSON_Delete(json);
            break;
        }

        json_extract_text(json, &tex);

        cJSON_Delete(json);

        size_t consumed = (size_t) (parse_end - buf);
        size_t remaining = JSON_BUF_SIZE - consumed;

        memmove(buf, parse_end, remaining);
        // Clear the freed tail: after a short read or at EOF it would
        // otherwise still hold bytes that were already parsed.
        memset(buf + remaining, 0, consumed);
        ptr = buf + remaining;
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
|
||||
30
libscan/json/json.h
Normal file
30
libscan/json/json.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef SCAN_JSON_H
|
||||
#define SCAN_JSON_H
|
||||
|
||||
#include "../scan.h"
|
||||
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
store_callback_t store;
|
||||
unsigned int json_mime;
|
||||
unsigned int ndjson_mime;
|
||||
} scan_json_ctx_t;
|
||||
|
||||
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
__always_inline
|
||||
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||
return mime == ctx->json_mime;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
|
||||
return mime == ctx->ndjson_mime;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -20,6 +20,9 @@
|
||||
#undef ABS
|
||||
#define ABS(a) (((a) < 0) ? -(a) : (a))
|
||||
|
||||
#define SHA1_STR_LENGTH 41
|
||||
#define SHA1_DIGEST_LENGTH 20
|
||||
|
||||
#define APPEND_STR_META(doc, keyname, value) \
|
||||
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
|
||||
meta_str->key = keyname; \
|
||||
|
||||
@@ -7,6 +7,22 @@
|
||||
|
||||
#define STORE_AS_IS ((void*)-1)
|
||||
|
||||
/*
 * Pick the file name handed to the demuxer.
 *
 * When the document path already carries an extension (doc->ext > doc->base)
 * the real path is returned untouched. Otherwise a placeholder name with a
 * matching extension is returned for PNG and JPEG mime types; any other mime
 * type falls back to the real path.
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {

    if (doc->ext > doc->base) {
        return filepath;
    }

    if (strcmp(mime_str, "image/png") == 0) {
        return "file.png";
    }

    if (strcmp(mime_str, "image/jpeg") == 0) {
        return "file.jpg";
    }

    return filepath;
}
|
||||
|
||||
|
||||
__always_inline
|
||||
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
|
||||
|
||||
@@ -497,7 +513,7 @@ int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
struct stat info;
|
||||
size_t size;
|
||||
FILE *file;
|
||||
void *buf;
|
||||
} memfile_t;
|
||||
@@ -511,14 +527,14 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
|
||||
return AVERROR_EOF;
|
||||
}
|
||||
|
||||
return buf_size;
|
||||
return (int) ret;
|
||||
}
|
||||
|
||||
long memfile_seek(void *ptr, long offset, int whence) {
|
||||
memfile_t *mem = ptr;
|
||||
|
||||
if (whence == 0x10000) {
|
||||
return mem->info.st_size;
|
||||
return mem->size;
|
||||
}
|
||||
|
||||
int ret = fseek(mem->file, offset, whence);
|
||||
@@ -530,24 +546,31 @@ long memfile_seek(void *ptr, long offset, int whence) {
|
||||
}
|
||||
|
||||
int memfile_open(vfile_t *f, memfile_t *mem) {
|
||||
mem->info = f->info;
|
||||
mem->size = f->info.st_size;
|
||||
|
||||
mem->buf = malloc(mem->info.st_size);
|
||||
mem->buf = malloc(mem->size);
|
||||
if (mem->buf == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int ret = f->read(f, mem->buf, mem->info.st_size);
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
int ret = f->read(f, mem->buf, mem->size);
|
||||
mem->file = fmemopen(mem->buf, mem->size, "rb");
|
||||
|
||||
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1;
|
||||
if (f->calculate_checksum) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
f->has_checksum = TRUE;
|
||||
}
|
||||
|
||||
return (ret == mem->size && mem->file != NULL) ? 0 : -1;
|
||||
}
|
||||
|
||||
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
|
||||
mem->info.st_size = buf_len;
|
||||
mem->size = (int) buf_len;
|
||||
|
||||
mem->buf = buf;
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
mem->file = fmemopen(mem->buf, mem->size, "rb");
|
||||
|
||||
return mem->file != NULL ? 0 : -1;
|
||||
}
|
||||
@@ -559,7 +582,7 @@ void memfile_close(memfile_t *mem) {
|
||||
}
|
||||
}
|
||||
|
||||
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc) {
|
||||
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
|
||||
|
||||
AVFormatContext *pFormatCtx = avformat_alloc_context();
|
||||
if (pFormatCtx == NULL) {
|
||||
@@ -569,7 +592,9 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
|
||||
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
|
||||
AVIOContext *io_ctx = NULL;
|
||||
memfile_t memfile = {{}, 0, 0};
|
||||
memfile_t memfile = {0, 0, 0};
|
||||
|
||||
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
|
||||
|
||||
if (f->info.st_size <= ctx->max_media_buffer) {
|
||||
int ret = memfile_open(f, &memfile);
|
||||
@@ -586,7 +611,7 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
|
||||
pFormatCtx->pb = io_ctx;
|
||||
|
||||
int res = avformat_open_input(&pFormatCtx, f->filepath, NULL, NULL);
|
||||
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
|
||||
if (res < 0) {
|
||||
if (res != -5) {
|
||||
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
|
||||
@@ -605,12 +630,12 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc)
|
||||
memfile_close(&memfile);
|
||||
}
|
||||
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
|
||||
|
||||
if (f->is_fs_file) {
|
||||
parse_media_filename(ctx, f->filepath, doc);
|
||||
} else {
|
||||
parse_media_vfile(ctx, f, doc);
|
||||
parse_media_vfile(ctx, f, doc, mime_str);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -619,7 +644,7 @@ void init_media() {
|
||||
}
|
||||
|
||||
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
|
||||
memfile_t memfile;
|
||||
memfile_t memfile = {0, 0, 0};
|
||||
AVIOContext *io_ctx = NULL;
|
||||
|
||||
AVFormatContext *pFormatCtx = avformat_alloc_context();
|
||||
@@ -637,8 +662,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
} else {
|
||||
avformat_close_input(&pFormatCtx);
|
||||
avformat_free_context(pFormatCtx);
|
||||
av_free(io_ctx->buffer);
|
||||
avio_context_free(&io_ctx);
|
||||
fclose(memfile.file);
|
||||
return FALSE;
|
||||
}
|
||||
@@ -658,7 +681,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
AVStream *stream = pFormatCtx->streams[0];
|
||||
|
||||
// Decoder
|
||||
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
|
||||
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
|
||||
avcodec_parameters_to_context(decoder, stream->codecpar);
|
||||
avcodec_open2(decoder, video_codec, NULL);
|
||||
|
||||
@@ -43,7 +43,7 @@ static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
|
||||
}
|
||||
|
||||
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
|
||||
|
||||
void init_media();
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "../ebook/ebook.h"
|
||||
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len) {
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {
|
||||
|
||||
// Open word doc
|
||||
options_type *opts = direct_vGetOptions();
|
||||
@@ -20,7 +20,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
||||
opts->iPageWidth = 595;
|
||||
opts->eImageLevel = level_ps_3;
|
||||
|
||||
int doc_word_version = iGuessVersionNumber(file_in, buf_len);
|
||||
int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
|
||||
if (doc_word_version < 0 || doc_word_version == 3) {
|
||||
free(buf);
|
||||
return;
|
||||
@@ -38,19 +38,19 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
||||
return;
|
||||
}
|
||||
|
||||
iInitDocument(file_in, buf_len);
|
||||
const char* author = szGetAuthor();
|
||||
iInitDocument(file_in, (int) buf_len);
|
||||
const char *author = szGetAuthor();
|
||||
if (author != NULL) {
|
||||
APPEND_UTF8_META(doc, MetaAuthor, author)
|
||||
}
|
||||
|
||||
const char* title = szGetTitle();
|
||||
const char *title = szGetTitle();
|
||||
if (title != NULL) {
|
||||
APPEND_UTF8_META(doc, MetaTitle, title)
|
||||
}
|
||||
vFreeDocument();
|
||||
|
||||
bWordDecryptor(file_in, buf_len, diag);
|
||||
bWordDecryptor(file_in, (int) buf_len, diag);
|
||||
vDestroyDiagram(diag);
|
||||
fclose(file_out);
|
||||
|
||||
@@ -71,7 +71,7 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
||||
free(out_buf);
|
||||
}
|
||||
|
||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* buf, size_t buf_len) {
|
||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {
|
||||
|
||||
scan_ebook_ctx_t ebook_ctx = {
|
||||
.content_size = ctx->content_size,
|
||||
@@ -93,7 +93,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
|
||||
opts->iPageWidth = 595;
|
||||
opts->eImageLevel = level_ps_3;
|
||||
|
||||
int doc_word_version = iGuessVersionNumber(file, buf_len);
|
||||
int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
|
||||
if (doc_word_version < 0 || doc_word_version == 3) {
|
||||
free(buf);
|
||||
return;
|
||||
@@ -110,7 +110,7 @@ void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void* b
|
||||
return;
|
||||
}
|
||||
|
||||
bWordDecryptor(file, buf_len, diag);
|
||||
bWordDecryptor(file, (int) buf_len, diag);
|
||||
vDestroyDiagram(diag);
|
||||
|
||||
fclose(file_out);
|
||||
|
||||
@@ -41,8 +41,6 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
|
||||
if (err->level == XML_ERR_FATAL) {
|
||||
CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
|
||||
return -1;
|
||||
} else {
|
||||
CTX_LOG_ERRORF("ooxml.c", "Got recoverable XML error while parsing document: %s", err->message)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +68,7 @@ int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_
|
||||
|
||||
int xml_io_read(void *context, char *buffer, int len) {
|
||||
struct archive *a = context;
|
||||
return archive_read_data(a, buffer, len);
|
||||
return (int) archive_read_data(a, buffer, len);
|
||||
}
|
||||
|
||||
int xml_io_close(UNUSED(void *context)) {
|
||||
@@ -78,7 +76,7 @@ int xml_io_close(UNUSED(void *context)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define READ_PART_ERR -2
|
||||
#define READ_PART_ERR (-2)
|
||||
|
||||
__always_inline
|
||||
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
|
||||
@@ -104,6 +102,42 @@ static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *bu
|
||||
return ret;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
|
||||
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
|
||||
|
||||
if (xml == NULL) {
|
||||
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
|
||||
return -1;
|
||||
}
|
||||
|
||||
xmlNode *root = xmlDocGetRootElement(xml);
|
||||
if (root == NULL) {
|
||||
CTX_LOG_ERROR(doc->filepath, "Empty document")
|
||||
xmlFreeDoc(xml);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (xmlStrEqual(root->name, _X("Properties"))) {
|
||||
for (xmlNode *child = root->children; child; child = child->next) {
|
||||
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
|
||||
if (text == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (xmlStrEqual(child->name, _X("Pages"))) {
|
||||
APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
|
||||
}
|
||||
|
||||
xmlFree(text);
|
||||
}
|
||||
}
|
||||
xmlFreeDoc(xml);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__always_inline
|
||||
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
|
||||
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
|
||||
@@ -144,7 +178,7 @@ static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define MAX_TN_SIZE 1024 * 1024 * 15
|
||||
#define MAX_TN_SIZE (1024 * 1024 * 15)
|
||||
|
||||
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
|
||||
size_t entry_size = archive_entry_size(entry);
|
||||
@@ -153,7 +187,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
|
||||
return;
|
||||
}
|
||||
|
||||
char* buf = malloc(entry_size);
|
||||
char *buf = malloc(entry_size);
|
||||
archive_read_data(a, buf, entry_size);
|
||||
|
||||
APPEND_TN_META(doc, 1, 1) // Size unknown
|
||||
@@ -196,6 +230,10 @@ void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
} else if (ret == TEXT_BUF_FULL) {
|
||||
buffer_full = TRUE;
|
||||
}
|
||||
} else if (strcmp(path, "docProps/app.xml") == 0) {
|
||||
if (read_doc_props_app(ctx, a, doc) != 0) {
|
||||
break;
|
||||
}
|
||||
} else if (strcmp(path, "docProps/core.xml") == 0) {
|
||||
if (read_doc_props(ctx, a, doc) != 0) {
|
||||
break;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <openssl/md5.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
@@ -16,12 +17,15 @@
|
||||
#define UNUSED(x) __attribute__((__unused__)) x
|
||||
|
||||
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
|
||||
|
||||
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
|
||||
|
||||
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
|
||||
|
||||
typedef int scan_code_t;
|
||||
#define SCAN_OK (scan_code_t) 0
|
||||
#define SCAN_ERR_READ (scan_code_t) (-1)
|
||||
#define SCAN_ERR_SKIP (scan_code_t) (-2)
|
||||
|
||||
#define LEVEL_DEBUG 0
|
||||
#define LEVEL_INFO 1
|
||||
@@ -68,6 +72,7 @@ enum metakey {
|
||||
MetaAuthor,
|
||||
MetaModifiedBy,
|
||||
MetaThumbnail,
|
||||
MetaChecksum,
|
||||
|
||||
// Number
|
||||
MetaWidth,
|
||||
@@ -129,11 +134,20 @@ typedef struct vfile {
|
||||
};
|
||||
|
||||
int is_fs_file;
|
||||
int has_checksum;
|
||||
int calculate_checksum;
|
||||
const char *filepath;
|
||||
struct stat info;
|
||||
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
|
||||
void *rewind_buffer;
|
||||
int rewind_buffer_size;
|
||||
int rewind_buffer_cursor;
|
||||
|
||||
read_func_t read;
|
||||
seek_func_t seek;
|
||||
read_func_t read_rewindable;
|
||||
close_func_t close;
|
||||
reset_func_t reset;
|
||||
log_callback_t log;
|
||||
|
||||
@@ -35,7 +35,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
return SCAN_OK;
|
||||
}
|
||||
|
||||
#define MAX_MARKUP_SIZE 1024 * 1024
|
||||
#define MAX_MARKUP_SIZE (1024 * 1024)
|
||||
|
||||
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
|
||||
@@ -9,11 +9,14 @@
|
||||
|
||||
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
|
||||
|
||||
#define TEXT_BUF_FULL -1
|
||||
#define INITIAL_BUF_SIZE 1024 * 16
|
||||
#define TEXT_BUF_FULL (-1)
|
||||
#define INITIAL_BUF_SIZE (1024 * 16)
|
||||
|
||||
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
|
||||
#define SHOULD_KEEP_CHAR(c) ((c >= '\'' && c <= ';') || (c >= 'A' && c <= 'z') || (c > 127))
|
||||
/*
 * Non-zero when code point `c` should be kept in extracted text:
 *   - the range '\'' .. ';'
 *   - the range 'A' .. 'z'
 *   - anything above 127 except 0x00A0 and 0xFFFD
 * `c` is evaluated several times, so pass an expression without side effects.
 */
#define SHOULD_KEEP_CHAR(c) ( \
    ((c) >= '\'' && (c) <= ';') \
    || ((c) >= 'A' && (c) <= 'z') \
    || ((c) > 127 && (c) != 0x00A0 && (c) != 0xFFFD))
|
||||
|
||||
|
||||
typedef struct dyn_buffer {
|
||||
@@ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)
|
||||
|
||||
__always_inline
|
||||
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
|
||||
unsigned char stack_buf[STACK_BUFFER_SIZE];
|
||||
|
||||
void *sha1_buf;
|
||||
if (size <= STACK_BUFFER_SIZE) {
|
||||
sha1_buf = stack_buf;
|
||||
} else {
|
||||
void *heap_sha1_buf = malloc(size);
|
||||
sha1_buf = heap_sha1_buf;
|
||||
}
|
||||
|
||||
memcpy(sha1_buf, buf, size);
|
||||
SHA1_Update(ctx, (const void *) sha1_buf, size);
|
||||
|
||||
if (sha1_buf != stack_buf) {
|
||||
free(sha1_buf);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
200
libscan/wpd/libwpd_c_api.cpp
Normal file
200
libscan/wpd/libwpd_c_api.cpp
Normal file
@@ -0,0 +1,200 @@
|
||||
#include "libwpd_c_api.h"
|
||||
#include "libwpd/libwpd.h"
|
||||
#include "libwpd/WPXProperty.h"
|
||||
#include "libwpd-stream/libwpd-stream.h"
|
||||
|
||||
class StringDocument : public WPXDocumentInterface {
|
||||
|
||||
private:
|
||||
text_buffer_t *tex;
|
||||
document_t *doc;
|
||||
bool is_full;
|
||||
public:
|
||||
|
||||
StringDocument(text_buffer_t *tex, document_t *doc) {
|
||||
this->tex = tex;
|
||||
this->doc = doc;
|
||||
this->is_full = false;
|
||||
}
|
||||
|
||||
void setDocumentMetaData(const WPXPropertyList &propList) override {
|
||||
|
||||
WPXPropertyList::Iter propIter(propList);
|
||||
for (propIter.rewind(); propIter.next();) {
|
||||
// TODO: Read metadata here ?!
|
||||
}
|
||||
}
|
||||
|
||||
void endDocument() override {
|
||||
text_buffer_terminate_string(this->tex);
|
||||
}
|
||||
|
||||
void closeParagraph() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void closeSpan() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void closeSection() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void insertTab() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void insertSpace() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void insertText(const WPXString &text) override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void insertLineBreak() override {
|
||||
if (!this->is_full) {
|
||||
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
|
||||
this->is_full = true;
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closePageSpan() override { /* noop */ }
|
||||
|
||||
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeHeader() override { /* noop */ }
|
||||
|
||||
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeFooter() override { /* noop */ }
|
||||
|
||||
void
|
||||
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||
|
||||
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||
|
||||
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void
|
||||
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||
|
||||
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||
|
||||
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeOrderedListLevel() override { /* noop */ }
|
||||
|
||||
void closeUnorderedListLevel() override { /* noop */ }
|
||||
|
||||
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
|
||||
|
||||
void closeListElement() override { /* noop */ }
|
||||
|
||||
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeFootnote() override { /* noop */ }
|
||||
|
||||
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeEndnote() override { /* noop */ }
|
||||
|
||||
void openComment(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeComment() override { /* noop */ }
|
||||
|
||||
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeTextBox() override { /* noop */ }
|
||||
|
||||
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
|
||||
|
||||
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeTableRow() override { /* noop */ }
|
||||
|
||||
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeTableCell() override { /* noop */ }
|
||||
|
||||
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeTable() override { /* noop */ }
|
||||
|
||||
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void closeFrame() override { /* noop */ }
|
||||
|
||||
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
|
||||
|
||||
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
|
||||
|
||||
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
|
||||
|
||||
void startDocument() override { /* noop */ };
|
||||
};
|
||||
|
||||
|
||||
// Wrap `buf_len` bytes at `buf` in a heap-allocated WPXStringStream and return
// it as an opaque handle. Release it with wpd_memory_stream_destroy().
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
|
||||
|
||||
// Ask libwpd how confident it is that the stream holds a supported document.
// The WPDConfidence result is cast straight to the C-side wpd_confidence_t.
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    auto *input = (WPXStringStream *) ptr;
    return (wpd_confidence_t) WPDocument::isFileFormatSupported(input);
}
|
||||
|
||||
// Parse the stream with libwpd (no password), collecting its text into `tex`
// through a StringDocument. The WPDResult is cast to the C-side wpd_result_t.
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    auto *input = (WPXStringStream *) ptr;
    StringDocument collector(tex, doc);

    return (wpd_result_t) WPDocument::parse(input, &collector, nullptr);
}
|
||||
|
||||
// Delete a stream handle obtained from wpd_memory_stream_create().
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete (WPXStringStream *) ptr;
}
|
||||
50
libscan/wpd/libwpd_c_api.h
Normal file
50
libscan/wpd/libwpd_c_api.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef SIST2_LIBWPD_C_API_H
|
||||
#define SIST2_LIBWPD_C_API_H
|
||||
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define EXTERNC extern "C"
|
||||
#else
|
||||
#define EXTERNC
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "../scan.h"
|
||||
#include "../util.h"
|
||||
#ifdef __cplusplus
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
typedef void *wpd_stream_t;
|
||||
|
||||
/*
 * C-side result of wpd_is_file_format_supported().
 * NOTE(review): the C++ wrapper produces these by casting libwpd's
 * WPDConfidence, so the numeric values must stay in step with that enum.
 */
typedef enum {
    C_WPD_CONFIDENCE_NONE = 0,
    C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION = 1,
    C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION = 2,
    C_WPD_CONFIDENCE_EXCELLENT = 3
} wpd_confidence_t;
|
||||
|
||||
/*
 * C-side result of wpd_parse().
 * NOTE(review): the C++ wrapper produces these by casting libwpd's WPDResult,
 * so the numeric values must stay in step with that enum.
 */
typedef enum {
    C_WPD_OK = 0,
    C_WPD_FILE_ACCESS_ERROR = 1,
    C_WPD_PARSE_ERROR = 2,
    C_WPD_UNSUPPORTED_ENCRYPTION_ERROR = 3,
    C_WPD_PASSWORD_MISSMATCH_ERROR = 4,
    C_WPD_OLE_ERROR = 5,
    C_WPD_UNKNOWN_ERROR = 6
} wpd_result_t;
|
||||
|
||||
|
||||
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
|
||||
|
||||
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
|
||||
|
||||
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
|
||||
|
||||
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
|
||||
|
||||
#endif
|
||||
41
libscan/wpd/wpd.c
Normal file
41
libscan/wpd/wpd.c
Normal file
@@ -0,0 +1,41 @@
|
||||
#include "wpd.h"
|
||||
#include "libwpd_c_api.h"
|
||||
|
||||
/**
 * Extract the text of a WordPerfect document and attach it to `doc` as
 * MetaContent.
 *
 * The whole file is read into memory, wrapped in a libwpd memory stream and
 * checked with wpd_is_file_format_supported() before parsing. Encrypted files
 * and files libwpd does not rate as "excellent" are rejected. A parse error is
 * logged, but any text collected up to that point is still attached.
 *
 * @param ctx parser context (logging callbacks)
 * @param f   file to read
 * @param doc document that receives the MetaContent entry
 * @return SCAN_ERR_READ if the file cannot be read, is encrypted or is not a
 *         supported WPD file; SCAN_OK otherwise.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    // Defensive: read_all() is defined elsewhere and its failure behaviour is
    // not visible here; never hand a NULL buffer to libwpd.
    if (buf == NULL) {
        CTX_LOG_ERRORF("wpd.c", "Could not read file [%s]", doc->filepath)
        return SCAN_ERR_READ;
    }

    scan_code_t status = SCAN_ERR_READ;

    wpd_stream_t stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
    } else if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
    } else {
        // NOTE(review): ctx->content_size is not consulted; the buffer is
        // created with -1 exactly as before - confirm whether a cap is intended.
        text_buffer_t tex = text_buffer_create(-1);
        wpd_result_t res = wpd_parse(stream, &tex, doc);

        if (res != C_WPD_OK) {
            CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                           doc->filepath, res)
        }

        // Best effort: keep whatever text was collected, even after an error.
        if (tex.dyn_buffer.cur != 0) {
            APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
        }

        text_buffer_destroy(&tex);
        status = SCAN_OK;
    }

    // Single cleanup path for every outcome after the stream was created.
    wpd_memory_stream_destroy(stream);
    free(buf);

    return status;
}
|
||||
23
libscan/wpd/wpd.h
Normal file
23
libscan/wpd/wpd.h
Normal file
@@ -0,0 +1,23 @@
|
||||
#ifndef SIST2_WPD_H
|
||||
#define SIST2_WPD_H
|
||||
|
||||
#include "../scan.h"
|
||||
#include "../util.h"
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
|
||||
unsigned int wpd_mime;
|
||||
} scan_wpd_ctx_t;
|
||||
|
||||
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
__always_inline
|
||||
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
|
||||
return mime == ctx->wpd_mime;
|
||||
}
|
||||
|
||||
#endif
|
||||
165
test/main.cpp
165
test/main.cpp
@@ -11,11 +11,14 @@ extern "C" {
|
||||
#include "../libscan/mobi/scan_mobi.h"
|
||||
#include "../libscan/raw/raw.h"
|
||||
#include "../libscan/msdoc/msdoc.h"
|
||||
#include "../libscan/wpd/wpd.h"
|
||||
#include "../libscan/json/json.h"
|
||||
#include <libavutil/avutil.h>
|
||||
}
|
||||
|
||||
static scan_arc_ctx_t arc_recurse_media_ctx;
|
||||
static scan_arc_ctx_t arc_list_ctx;
|
||||
static scan_arc_ctx_t arc_recurse_ooxml_ctx;
|
||||
|
||||
static scan_text_ctx_t text_500_ctx;
|
||||
|
||||
@@ -39,11 +42,20 @@ static scan_msdoc_ctx_t msdoc_ctx;
|
||||
|
||||
static scan_msdoc_ctx_t msdoc_text_ctx;
|
||||
|
||||
static scan_wpd_ctx_t wpd_ctx;
|
||||
|
||||
document_t LastSubDoc;
|
||||
static scan_json_ctx_t json_ctx;
|
||||
|
||||
|
||||
static document_t LastSubDoc;
|
||||
static char *RecurseMediaMime = (char *) "";
|
||||
|
||||
void _parse_media(parse_job_t *job) {
|
||||
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
|
||||
parse_media(&media_ctx, &job->vfile, &LastSubDoc, RecurseMediaMime);
|
||||
}
|
||||
|
||||
// Archive-recursion callback: parse the job's file as OOXML using the shared
// ooxml_500_ctx and store the result in LastSubDoc for the test to inspect.
void _parse_ooxml(parse_job_t *job) {
    vfile_t *entry = &job->vfile;
    parse_ooxml(&ooxml_500_ctx, entry, &LastSubDoc);
}
|
||||
|
||||
|
||||
@@ -219,6 +231,24 @@ TEST(Ebook, Utf8Pdf) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
TEST(Ebook, Utf8PdfInvalidChars) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc);

    // Parse with the tesseract language cleared, then restore the shared
    // context before any assertion can bail out of the test.
    ebook_ctx.tesseract_lang = nullptr;
    parse_ebook(&ebook_ctx, &f, "application/pdf", &doc);
    ebook_ctx.tesseract_lang = "eng";

    // The text should read "HART is a group of highly qualified ...", but the
    // PDF's text layer was intentionally scrambled by its authors. The parser
    // can at least filter out non-printable/invalid characters (e.g. U+FFFD).
    meta_line_t *content = get_meta(&doc, MetaContent);
    ASSERT_NE(content, nullptr);  // fail cleanly instead of dereferencing null
    ASSERT_TRUE(STR_STARTS_WITH(content->str_val, "HART i a g f highl alified "));

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ebook, Pdf2) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
@@ -360,7 +390,7 @@ TEST(MediaImage, ExifGps1) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/exif_GPS.jpg", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "image/jpeg");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeRef)->str_val, "N");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaExifGpsLatitudeDMS)->str_val, "48:1 , 56585399:1000000, 0:1");
|
||||
@@ -376,7 +406,7 @@ TEST(MediaImage, Exif1) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/exiftest1.jpg", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "image/jpeg");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaContent)->str_val, "I don't know if it's a thing mostly done for high end "
|
||||
"hotels or what, but I've seen it in a few places in Thailand: "
|
||||
@@ -405,13 +435,28 @@ TEST(MediaImage, Mem1) {
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc);
|
||||
RecurseMediaMime = (char *) "image/jpeg";
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
|
||||
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
// Parsing this JPEG straight from the filesystem must grow the store by
// exactly 14098 bytes.
TEST(MediaImage, AsIsFs) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/9555.jpg", &file, &document);

    const size_t stored_before = store_size;
    parse_media(&media_ctx, &file, &document, "image/jpeg");

    ASSERT_EQ(stored_before + 14098, store_size);

    cleanup(&document, &file);
}
|
||||
|
||||
TEST(MediaImage, Mem2AsIs) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
@@ -419,7 +464,8 @@ TEST(MediaImage, Mem2AsIs) {
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc);
|
||||
RecurseMediaMime = (char *) "image/jpeg";
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
|
||||
|
||||
ASSERT_EQ(size_before + 14098, store_size);
|
||||
|
||||
@@ -432,7 +478,7 @@ TEST(MediaVideo, VidMkvSubDisabled) {
|
||||
load_doc_file("libscan-test-files/test_files/media/berd.mkv", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
|
||||
|
||||
ASSERT_NE(size_before, store_size);
|
||||
ASSERT_EQ(get_meta(&doc, MetaContent), nullptr);
|
||||
@@ -447,7 +493,7 @@ TEST(MediaVideo, VidMkvSubEnabled) {
|
||||
|
||||
size_t size_before = store_size;
|
||||
media_ctx.read_subtitles = TRUE;
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
|
||||
media_ctx.read_subtitles = FALSE;
|
||||
|
||||
ASSERT_NE(size_before, store_size);
|
||||
@@ -461,7 +507,7 @@ TEST(MediaVideo, Vid3Mp4) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/vid3.mp4", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "video/mp4");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "Helicopter (((Accident))) - "
|
||||
"https://archive.org/details/Virginia_Helicopter_Crash");
|
||||
@@ -478,7 +524,7 @@ TEST(MediaVideo, Vid3Ogv) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/vid3.ogv", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "application/ogg");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "theora");
|
||||
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 590261);
|
||||
@@ -493,7 +539,7 @@ TEST(MediaVideo, Vid3Webm) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/vid3.webm", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "video/webm");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaMediaVideoCodec)->str_val, "vp8");
|
||||
ASSERT_EQ(get_meta(&doc, MetaMediaBitrate)->long_val, 343153);
|
||||
@@ -510,7 +556,8 @@ TEST(MediaVideoVfile, Vid3Ogv) {
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc);
|
||||
RecurseMediaMime = (char *) "video/webm";
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
|
||||
|
||||
// ASSERT_STREQ(get_meta(&LastSubDoc, MetaMediaVideoCodec)->str_val, "theora");
|
||||
ASSERT_EQ(get_meta(&LastSubDoc, MetaMediaBitrate)->long_val, 590261);
|
||||
@@ -525,7 +572,7 @@ TEST(MediaVideo, VidDuplicateTags) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/vid_tags.mkv", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "video/x-matroska");
|
||||
|
||||
meta_line_t *meta_content = get_meta(&doc, MetaContent);
|
||||
ASSERT_STREQ(meta_content->str_val, "he's got a point");
|
||||
@@ -549,7 +596,7 @@ TEST(MediaAudio, MusicMp3) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/media/02-The Watchmaker-Barry James_spoken.mp3", &f, &doc);
|
||||
|
||||
parse_media(&media_ctx, &f, &doc);
|
||||
parse_media(&media_ctx, &f, &doc, "audio/x-mpeg-3");
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaArtist)->str_val, "Barry James");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAlbum)->str_val, "Strange Slumber, Music for Wonderful Dreams");
|
||||
@@ -587,11 +634,48 @@ TEST(Ooxml, Docx1) {
|
||||
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Thomas");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaModifiedBy)->str_val, "Thomas");
|
||||
ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
TEST(Ooxml, Docx2) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx", &f, &doc);

    // Raise the shared context's content_size for this parse only. It is put
    // back to 500 *before* the assertions: ASSERT_* returns from the test on
    // failure, which would otherwise leave 999999 in place for later tests.
    ooxml_500_ctx.content_size = 999999;
    parse_ooxml(&ooxml_500_ctx, &f, &doc);
    ooxml_500_ctx.content_size = 500;

    ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&doc, MetaContent)->str_val), 2780);

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ooxml, Docx2Archive) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ooxml/docx2.docx.7z", &f, &doc);

    // Raise the shared context's content_size for this parse only. It is put
    // back to 500 *before* the assertions: ASSERT_* returns from the test on
    // failure, which would otherwise leave 999999 in place for later tests.
    ooxml_500_ctx.content_size = 999999;
    parse_archive(&arc_recurse_ooxml_ctx, &f, &doc, nullptr, nullptr);
    ooxml_500_ctx.content_size = 500;

    // The archive recursion hands the inner .docx to _parse_ooxml, which
    // stores its result in LastSubDoc.
    ASSERT_STREQ(get_meta(&LastSubDoc, MetaAuthor)->str_val, "liz evans");
    ASSERT_EQ(get_meta(&LastSubDoc, MetaPages)->long_val, 1);
    ASSERT_EQ(strlen(get_meta(&LastSubDoc, MetaContent)->str_val), 2780);

    fprintf(stderr, "%s\n", get_meta(&LastSubDoc, MetaContent)->str_val);

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ooxml, Docx2Thumbnail) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
@@ -602,6 +686,7 @@ TEST(Ooxml, Docx2Thumbnail) {
|
||||
parse_ooxml(&ooxml_500_ctx, &f, &doc);
|
||||
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), 500, 4);
|
||||
ASSERT_EQ(get_meta(&doc, MetaPages)->long_val, 2);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
@@ -670,7 +755,7 @@ TEST(Arc, Utf8) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc);
|
||||
|
||||
parse_archive(&arc_list_ctx, &f, &doc);
|
||||
parse_archive(&arc_list_ctx, &f, &doc, nullptr, nullptr);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr);
|
||||
|
||||
@@ -685,7 +770,7 @@ TEST(Arc, EncryptedZip) {
|
||||
size_t size_before = store_size;
|
||||
|
||||
strcpy(arc_recurse_media_ctx.passphrase, "sist2");
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc);
|
||||
parse_archive(&arc_recurse_media_ctx, &f, &doc, nullptr, nullptr);
|
||||
|
||||
arc_recurse_media_ctx.passphrase[0] = '\0';
|
||||
|
||||
@@ -940,7 +1025,7 @@ TEST(Msdoc, TestFuzz1) {
|
||||
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
size_t buf_len_copy = buf_len;
|
||||
char *buf_copy = (char*)malloc(buf_len);
|
||||
char *buf_copy = (char *) malloc(buf_len);
|
||||
memcpy(buf_copy, buf, buf_len);
|
||||
|
||||
fuzz_buffer(buf_copy, &buf_len_copy, 3, 8, 5);
|
||||
@@ -951,6 +1036,38 @@ TEST(Msdoc, TestFuzz1) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
// The extracted text of the sample WPD file must match exactly, with
// paragraph and page boundaries flattened to single spaces.
TEST(Wpd, Wpd51_1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/wpd/test51_1.wpd", &file, &document);

    parse_wpd(&wpd_ctx, &file, &document);

    const char *extracted = get_meta(&document, MetaContent)->str_val;
    ASSERT_STREQ(extracted,
                 "Hello, WordPerfect This is a test This is the next page This is another page");

    cleanup(&document, &file);
}
|
||||
|
||||
// Smoke test: parsing the sample JSON file must complete; nothing is asserted
// about the extracted metadata.
TEST(Json, Json1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/json/json1.json", &file, &document);

    parse_json(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
|
||||
|
||||
// Smoke test: parsing the sample newline-delimited JSON file must complete;
// nothing is asserted about the extracted metadata.
TEST(Json, NDJson1) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/json/ndjson1.jsonl", &file, &document);

    parse_ndjson(&json_ctx, &file, &document);

    cleanup(&document, &file);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
setlocale(LC_ALL, "");
|
||||
@@ -961,6 +1078,12 @@ int main(int argc, char **argv) {
|
||||
arc_recurse_media_ctx.mode = ARC_MODE_RECURSE;
|
||||
arc_recurse_media_ctx.parse = _parse_media;
|
||||
|
||||
arc_recurse_ooxml_ctx.log = noop_log;
|
||||
arc_recurse_ooxml_ctx.logf = noop_logf;
|
||||
arc_recurse_ooxml_ctx.store = counter_store;
|
||||
arc_recurse_ooxml_ctx.mode = ARC_MODE_RECURSE;
|
||||
arc_recurse_ooxml_ctx.parse = _parse_ooxml;
|
||||
|
||||
arc_list_ctx.log = noop_log;
|
||||
arc_list_ctx.logf = noop_logf;
|
||||
arc_list_ctx.store = counter_store;
|
||||
@@ -1032,6 +1155,14 @@ int main(int argc, char **argv) {
|
||||
msdoc_text_ctx.content_size = 500;
|
||||
msdoc_text_ctx.tn_size = 0;
|
||||
|
||||
wpd_ctx.log = noop_log;
|
||||
wpd_ctx.logf = noop_logf;
|
||||
wpd_ctx.content_size = 500;
|
||||
|
||||
json_ctx.log = noop_log;
|
||||
json_ctx.logf = noop_logf;
|
||||
json_ctx.content_size = 5000;
|
||||
|
||||
av_log_set_level(AV_LOG_QUIET);
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
|
||||
@@ -16,7 +16,7 @@ int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
return read(f->fd, buf, size);
|
||||
return (int) read(f->fd, buf, size);
|
||||
}
|
||||
|
||||
//Note: No out of bounds check
|
||||
@@ -61,12 +61,14 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
f->read = fs_read;
|
||||
f->close = fs_close;
|
||||
f->is_fs_file = TRUE;
|
||||
f->calculate_checksum = TRUE;
|
||||
f->has_checksum = FALSE;
|
||||
}
|
||||
|
||||
void load_mem(void *mem, size_t size, vfile_t *f) {
|
||||
f->filepath = "_mem_";
|
||||
f->_test_data = mem;
|
||||
f->info.st_size = size;
|
||||
f->info.st_size = (int) size;
|
||||
f->read = mem_read;
|
||||
f->close = nullptr;
|
||||
f->is_fs_file = TRUE;
|
||||
@@ -106,7 +108,7 @@ void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
|
||||
}
|
||||
|
||||
for (int disp = 0; disp < width; disp++) {
|
||||
buf[offset + disp] = (int8_t)rand();
|
||||
buf[offset + disp] = (int8_t) rand();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user