diff --git a/libscan/arc/arc.c b/libscan/arc/arc.c index f59fcbf..d89c188 100644 --- a/libscan/arc/arc.c +++ b/libscan/arc/arc.c @@ -4,6 +4,7 @@ #include #include #include +#include int should_parse_filtered_file(const char *filepath, int ext) { @@ -33,18 +34,29 @@ int should_parse_filtered_file(const char *filepath, int ext) { return FALSE; } -int arc_read(struct vfile *f, void *buf, size_t size) { - size_t read = archive_read_data(f->arc, buf, size); +void arc_close(struct vfile *f) { + SHA1_Final(f->sha1_digest, &f->sha1_ctx); +} - if (read != size) { - const char* error_str = archive_error_string(f->arc); + +int arc_read(struct vfile *f, void *buf, size_t size) { + size_t bytes_read = archive_read_data(f->arc, buf, size); + + if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) { + f->has_checksum = TRUE; + + safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read); + } + + if (bytes_read != size) { + const char *error_str = archive_error_string(f->arc); if (error_str != NULL) { f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str); } return -1; } - return read; + return (int) bytes_read; } int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) { @@ -58,7 +70,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar archive_read_add_passphrase(*a, ctx->passphrase); } - return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); + return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE); } else if (allow_recurse) { *a = archive_read_new(); archive_read_support_filter_all(*a); @@ -102,8 +114,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { if (S_ISREG(archive_entry_stat(entry)->st_mode)) { - const char* utf8_name = archive_entry_pathname_utf8(entry); - const char* file_path = utf8_name == NULL ? 
archive_entry_pathname(entry) : utf8_name; + const char *utf8_name = archive_entry_pathname_utf8(entry); + const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name; dyn_buffer_append_string(&buf, file_path); dyn_buffer_write_char(&buf, ' '); @@ -121,7 +133,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2); - sub_job->vfile.close = NULL; + sub_job->vfile.close = arc_close; sub_job->vfile.read = arc_read; sub_job->vfile.reset = NULL; sub_job->vfile.arc = a; @@ -129,13 +141,15 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { sub_job->vfile.is_fs_file = FALSE; sub_job->vfile.log = ctx->log; sub_job->vfile.logf = ctx->logf; + sub_job->vfile.has_checksum = FALSE; + sub_job->vfile.calculate_checksum = f->calculate_checksum; memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH); while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { sub_job->vfile.info = *archive_entry_stat(entry); if (S_ISREG(sub_job->vfile.info.st_mode)) { - const char* utf8_name = archive_entry_pathname_utf8(entry); + const char *utf8_name = archive_entry_pathname_utf8(entry); if (utf8_name == NULL) { sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry)); @@ -151,6 +165,9 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { sub_job->ext = (int) strlen(sub_job->filepath); } + memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx)); + SHA1_Init(&sub_job->vfile.sha1_ctx); + ctx->parse(sub_job); } } diff --git a/libscan/arc/arc.h b/libscan/arc/arc.h index ea5a803..346f36a 100644 --- a/libscan/arc/arc.h +++ b/libscan/arc/arc.h @@ -6,7 +6,7 @@ #include #include "../scan.h" -# define ARC_SKIPPED -1 +# define ARC_SKIPPED (-1) #define ARC_MODE_SKIP 0 #define ARC_MODE_LIST 1 #define ARC_MODE_SHALLOW 2 @@ -31,27 +31,34 @@ typedef struct { } arc_data_t; static int 
vfile_open_callback(struct archive *a, void *user_data) { - arc_data_t *data = (arc_data_t*)user_data; + arc_data_t *data = (arc_data_t *) user_data; - if (data->f->is_fs_file && data->f->fd == -1) { - data->f->fd = open(data->f->filepath, O_RDONLY); + if (!data->f->is_fs_file) { + SHA1_Init(&data->f->sha1_ctx); } return ARCHIVE_OK; } static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) { - arc_data_t *data = (arc_data_t*)user_data; + arc_data_t *data = (arc_data_t *) user_data; *buf = data->buf; - return data->f->read(data->f, data->buf, ARC_BUF_SIZE); + long ret = data->f->read(data->f, data->buf, sizeof(data->buf)); + + if (!data->f->is_fs_file && ret > 0) { + data->f->has_checksum = TRUE; + safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret); + } + + return ret; } static int vfile_close_callback(struct archive *a, void *user_data) { - arc_data_t *data = (arc_data_t*)user_data; + arc_data_t *data = (arc_data_t *) user_data; - if (data->f->close != NULL) { - data->f->close(data->f); + if (!data->f->is_fs_file) { + SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx); } return ARCHIVE_OK; @@ -63,6 +70,8 @@ int should_parse_filtered_file(const char *filepath, int ext); scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc); -int arc_read(struct vfile * f, void *buf, size_t size); +int arc_read(struct vfile *f, void *buf, size_t size); + +void arc_close(struct vfile *f); #endif diff --git a/libscan/json/json.c b/libscan/json/json.c index a12822b..3ba69f0 100644 --- a/libscan/json/json.c +++ b/libscan/json/json.c @@ -67,10 +67,10 @@ scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) { scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) { - char *buf = malloc(JSON_BUF_SIZE + 1); + char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char)); *(buf + JSON_BUF_SIZE) = '\0'; - text_buffer_t tex = text_buffer_create(-1); + text_buffer_t tex = 
text_buffer_create(ctx->content_size); size_t ret; int eof = FALSE; diff --git a/libscan/macros.h b/libscan/macros.h index fb7dc61..f7e6b12 100644 --- a/libscan/macros.h +++ b/libscan/macros.h @@ -20,6 +20,9 @@ #undef ABS #define ABS(a) (((a) < 0) ? -(a) : (a)) +#define SHA1_STR_LENGTH 41 +#define SHA1_DIGEST_LENGTH 20 + #define APPEND_STR_META(doc, keyname, value) \ {meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \ meta_str->key = keyname; \ diff --git a/libscan/media/media.c b/libscan/media/media.c index 73dc52e..dba4bcf 100644 --- a/libscan/media/media.c +++ b/libscan/media/media.c @@ -511,7 +511,7 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) { return AVERROR_EOF; } - return buf_size; + return (int) ret; } long memfile_seek(void *ptr, long offset, int whence) { @@ -540,11 +540,18 @@ int memfile_open(vfile_t *f, memfile_t *mem) { int ret = f->read(f, mem->buf, mem->info.st_size); mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); + if (f->calculate_checksum) { + SHA1_Init(&f->sha1_ctx); + safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size); + SHA1_Final(f->sha1_digest, &f->sha1_ctx); + f->has_checksum = TRUE; + } + return (ret == mem->info.st_size && mem->file != NULL) ? 
0 : -1; } int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) { - mem->info.st_size = buf_len; + mem->info.st_size = (int) buf_len; mem->buf = buf; mem->file = fmemopen(mem->buf, mem->info.st_size, "rb"); @@ -619,7 +626,7 @@ void init_media() { } int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) { - memfile_t memfile; + memfile_t memfile = {{}, 0, 0}; AVIOContext *io_ctx = NULL; AVFormatContext *pFormatCtx = avformat_alloc_context(); @@ -637,8 +644,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu } else { avformat_close_input(&pFormatCtx); avformat_free_context(pFormatCtx); - av_free(io_ctx->buffer); - avio_context_free(&io_ctx); fclose(memfile.file); return FALSE; } diff --git a/libscan/scan.h b/libscan/scan.h index e06e8c3..47b0177 100644 --- a/libscan/scan.h +++ b/libscan/scan.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "macros.h" @@ -69,6 +70,7 @@ enum metakey { MetaAuthor, MetaModifiedBy, MetaThumbnail, + MetaChecksum, // Number MetaWidth, @@ -130,9 +132,14 @@ typedef struct vfile { }; int is_fs_file; + int has_checksum; + int calculate_checksum; const char *filepath; struct stat info; + SHA_CTX sha1_ctx; + unsigned char sha1_digest[SHA1_DIGEST_LENGTH]; + read_func_t read; seek_func_t seek; close_func_t close; diff --git a/libscan/util.h b/libscan/util.h index f6dc6cd..acfa16d 100644 --- a/libscan/util.h +++ b/libscan/util.h @@ -13,7 +13,10 @@ #define INITIAL_BUF_SIZE (1024 * 16) #define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c)) -#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127)) +#define SHOULD_KEEP_CHAR(c) (\ + ((c) >= '\'' && (c) <= ';') || \ + ((c) >= 'A' && (c) <= 'z') || \ + ((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD)) typedef struct dyn_buffer { @@ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) { return buf; } +#define STACK_BUFFER_SIZE 
(size_t)(4096 * 8) + +__always_inline +static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) { + unsigned char stack_buf[STACK_BUFFER_SIZE]; + + void *sha1_buf; + if (size <= STACK_BUFFER_SIZE) { + sha1_buf = stack_buf; + } else { + void *heap_sha1_buf = malloc(size); + sha1_buf = heap_sha1_buf; + } + + memcpy(sha1_buf, buf, size); + SHA1_Update(ctx, (const void *) sha1_buf, size); + + if (sha1_buf != stack_buf) { + free(sha1_buf); + } +} + #endif diff --git a/test/main.cpp b/test/main.cpp index 42f2818..675a8ab 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -46,7 +46,7 @@ static scan_wpd_ctx_t wpd_ctx; static scan_json_ctx_t json_ctx; -document_t LastSubDoc; +static document_t LastSubDoc; void _parse_media(parse_job_t *job) { parse_media(&media_ctx, &job->vfile, &LastSubDoc); @@ -225,6 +225,24 @@ TEST(Ebook, Utf8Pdf) { cleanup(&doc, &f); } +TEST(Ebook, Utf8PdfInvalidChars) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc); + + ebook_ctx.tesseract_lang = nullptr; + + parse_ebook(&ebook_ctx, &f, "application/pdf", &doc); + + ebook_ctx.tesseract_lang = "eng"; + + // It should say "HART is a group of highly qualified ..." 
but the PDF + // text has been intentionally garbled by the authors + // We can at least filter out the non-printable/invalid characters like '�' etc + ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified ")); + cleanup(&doc, &f); +} + TEST(Ebook, Pdf2) { vfile_t f; document_t doc; @@ -418,6 +436,20 @@ TEST(MediaImage, Mem1) { cleanup(&doc, &f); } +TEST(MediaImage, AsIsFs) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/media/9555.jpg", &f, &doc); + + size_t size_before = store_size; + + parse_media(&media_ctx, &f, &doc); + + ASSERT_EQ(size_before + 14098, store_size); + + cleanup(&doc, &f); +} + TEST(MediaImage, Mem2AsIs) { vfile_t f; document_t doc; diff --git a/test/test_util.cpp b/test/test_util.cpp index 9e134b0..0174d40 100644 --- a/test/test_util.cpp +++ b/test/test_util.cpp @@ -61,6 +61,8 @@ void load_file(const char *filepath, vfile_t *f) { f->read = fs_read; f->close = fs_close; f->is_fs_file = TRUE; + f->calculate_checksum = TRUE; + f->has_checksum = FALSE; } void load_mem(void *mem, size_t size, vfile_t *f) {