Support for sha1sum

This commit is contained in:
simon987 2021-09-11 13:00:59 -04:00
parent 23da8ada5f
commit 52d7649322
9 changed files with 129 additions and 29 deletions

View File

@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
int should_parse_filtered_file(const char *filepath, int ext) {
@ -33,18 +34,29 @@ int should_parse_filtered_file(const char *filepath, int ext) {
return FALSE;
}
int arc_read(struct vfile *f, void *buf, size_t size) {
size_t read = archive_read_data(f->arc, buf, size);
/**
 * Finalize the SHA-1 computation held in f->sha1_ctx and store the
 * resulting digest in f->sha1_digest.
 *
 * @param f vfile whose sha1_ctx has been initialized and updated.
 */
void arc_close(struct vfile *f) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
}
if (read != size) {
const char* error_str = archive_error_string(f->arc);
int arc_read(struct vfile *f, void *buf, size_t size) {
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read);
}
if (bytes_read != size) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
return read;
return (int) bytes_read;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
@ -58,7 +70,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
archive_read_add_passphrase(*a, ctx->passphrase);
}
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
} else if (allow_recurse) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
@ -102,8 +114,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char* utf8_name = archive_entry_pathname_utf8(entry);
const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
dyn_buffer_append_string(&buf, file_path);
dyn_buffer_write_char(&buf, ' ');
@ -121,7 +133,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
sub_job->vfile.close = NULL;
sub_job->vfile.close = arc_close;
sub_job->vfile.read = arc_read;
sub_job->vfile.reset = NULL;
sub_job->vfile.arc = a;
@ -129,13 +141,15 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
sub_job->vfile.is_fs_file = FALSE;
sub_job->vfile.log = ctx->log;
sub_job->vfile.logf = ctx->logf;
sub_job->vfile.has_checksum = FALSE;
sub_job->vfile.calculate_checksum = f->calculate_checksum;
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
sub_job->vfile.info = *archive_entry_stat(entry);
if (S_ISREG(sub_job->vfile.info.st_mode)) {
const char* utf8_name = archive_entry_pathname_utf8(entry);
const char *utf8_name = archive_entry_pathname_utf8(entry);
if (utf8_name == NULL) {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
@ -151,6 +165,9 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
sub_job->ext = (int) strlen(sub_job->filepath);
}
memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx));
SHA1_Init(&sub_job->vfile.sha1_ctx);
ctx->parse(sub_job);
}
}

View File

@ -6,7 +6,7 @@
#include <fcntl.h>
#include "../scan.h"
# define ARC_SKIPPED -1
# define ARC_SKIPPED (-1)
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
@ -31,27 +31,34 @@ typedef struct {
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t*)user_data;
arc_data_t *data = (arc_data_t *) user_data;
if (data->f->is_fs_file && data->f->fd == -1) {
data->f->fd = open(data->f->filepath, O_RDONLY);
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t*)user_data;
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t*)user_data;
arc_data_t *data = (arc_data_t *) user_data;
if (data->f->close != NULL) {
data->f->close(data->f);
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
@ -63,6 +70,8 @@ int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
int arc_read(struct vfile * f, void *buf, size_t size);
int arc_read(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif

View File

@ -67,10 +67,10 @@ scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
char *buf = malloc(JSON_BUF_SIZE + 1);
char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
*(buf + JSON_BUF_SIZE) = '\0';
text_buffer_t tex = text_buffer_create(-1);
text_buffer_t tex = text_buffer_create(ctx->content_size);
size_t ret;
int eof = FALSE;

View File

@ -20,6 +20,9 @@
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \

View File

@ -511,7 +511,7 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
return AVERROR_EOF;
}
return buf_size;
return (int) ret;
}
long memfile_seek(void *ptr, long offset, int whence) {
@ -540,11 +540,18 @@ int memfile_open(vfile_t *f, memfile_t *mem) {
int ret = f->read(f, mem->buf, mem->info.st_size);
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
if (f->calculate_checksum) {
SHA1_Init(&f->sha1_ctx);
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size);
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
f->has_checksum = TRUE;
}
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1;
}
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
mem->info.st_size = buf_len;
mem->info.st_size = (int) buf_len;
mem->buf = buf;
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
@ -619,7 +626,7 @@ void init_media() {
}
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
memfile_t memfile;
memfile_t memfile = {{}, 0, 0};
AVIOContext *io_ctx = NULL;
AVFormatContext *pFormatCtx = avformat_alloc_context();
@ -637,8 +644,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
} else {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}

View File

@ -8,6 +8,7 @@
#include <stdio.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h"
@ -69,6 +70,7 @@ enum metakey {
MetaAuthor,
MetaModifiedBy,
MetaThumbnail,
MetaChecksum,
// Number
MetaWidth,
@ -130,9 +132,14 @@ typedef struct vfile {
};
int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath;
struct stat info;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
read_func_t read;
seek_func_t seek;
close_func_t close;

View File

@ -13,7 +13,10 @@
#define INITIAL_BUF_SIZE (1024 * 16)
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) (((c) >= '\'' && (c) <= ';') || ((c) >= 'A' && (c) <= 'z') || ((c) > 127))
/*
 * Evaluates to non-zero when character / code point `c` should be kept:
 *   - '\'' through ';'  (0x27 - 0x3B)
 *   - 'A'  through 'z'  (0x41 - 0x7A)
 *   - anything above 127, except 0x00A0 and 0xFFFD
 *
 * The previous form also tested a bare `(c)` inside the last clause; that
 * term is always true once `(c) > 127` holds, so it has been dropped.
 *
 * `c` is expanded several times: do not pass an expression with side effects.
 */
#define SHOULD_KEEP_CHAR(c) ( \
    ((c) >= '\'' && (c) <= ';') || \
    ((c) >= 'A' && (c) <= 'z') || \
    ((c) > 127 && (c) != 0x00A0 && (c) != 0xFFFD))
typedef struct dyn_buffer {
@ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) {
return buf;
}
/* Size, in bytes, of the bounce buffer used by safe_sha1_update(). */
#define STACK_BUFFER_SIZE ((size_t)(4096 * 8))

/**
 * Feed `size` bytes starting at `buf` into the SHA-1 context `ctx`.
 *
 * The input is never passed to SHA1_Update() directly; it is first copied
 * into a local stack buffer, exactly as before.
 * NOTE(review): the reason for the bounce copy is not visible in this file;
 * it is preserved as-is -- confirm with the callers before removing it.
 *
 * Inputs larger than STACK_BUFFER_SIZE are processed in STACK_BUFFER_SIZE
 * chunks. SHA-1 is a streaming hash, so consecutive updates over adjacent
 * chunks produce the same digest as one update over the whole range. This
 * replaces a malloc(size) whose result was never checked: on allocation
 * failure the old code would memcpy() into a NULL pointer.
 *
 * @param ctx  initialized SHA-1 context to update
 * @param buf  data to hash (only read)
 * @param size number of bytes to hash; 0 leaves `ctx` untouched
 */
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
    unsigned char stack_buf[STACK_BUFFER_SIZE];
    const unsigned char *src = buf;

    while (size > 0) {
        size_t chunk = size < STACK_BUFFER_SIZE ? size : STACK_BUFFER_SIZE;

        memcpy(stack_buf, src, chunk);
        SHA1_Update(ctx, (const void *) stack_buf, chunk);

        src += chunk;
        size -= chunk;
    }
}
#endif

View File

@ -46,7 +46,7 @@ static scan_wpd_ctx_t wpd_ctx;
static scan_json_ctx_t json_ctx;
document_t LastSubDoc;
static document_t LastSubDoc;
void _parse_media(parse_job_t *job) {
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
@ -225,6 +225,24 @@ TEST(Ebook, Utf8Pdf) {
cleanup(&doc, &f);
}
// Parses a PDF whose text layer contains invalid characters and checks that
// the extracted content starts with the expected, filtered string.
TEST(Ebook, Utf8PdfInvalidChars) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc);
// tesseract_lang is cleared for this parse only and restored right after,
// because ebook_ctx is shared with the other tests.
// NOTE(review): presumably this disables OCR for the parse -- confirm.
ebook_ctx.tesseract_lang = nullptr;
parse_ebook(&ebook_ctx, &f, "application/pdf", &doc);
ebook_ctx.tesseract_lang = "eng";
// It should say "HART is a group of highly qualified ..." but the PDF
// text has been intentionally scrambled by its authors.
// We can at least filter out the non-printable/invalid characters (e.g. U+FFFD).
ASSERT_TRUE(STR_STARTS_WITH(get_meta(&doc, MetaContent)->str_val, "HART i a g f highl alified "));
cleanup(&doc, &f);
}
TEST(Ebook, Pdf2) {
vfile_t f;
document_t doc;
@ -418,6 +436,20 @@ TEST(MediaImage, Mem1) {
cleanup(&doc, &f);
}
// Parses a JPEG loaded through load_doc_file() and checks how much
// parse_media() adds to the store.
TEST(MediaImage, AsIsFs) {
vfile_t f;
document_t doc;
load_doc_file("libscan-test-files/test_files/media/9555.jpg", &f, &doc);
// Snapshot the global store size so only this parse's contribution is measured.
size_t size_before = store_size;
parse_media(&media_ctx, &f, &doc);
// The store must grow by exactly 14098.
// NOTE(review): presumably the byte size of the image stored unmodified
// ("as is") -- confirm against the test fixture.
ASSERT_EQ(size_before + 14098, store_size);
cleanup(&doc, &f);
}
TEST(MediaImage, Mem2AsIs) {
vfile_t f;
document_t doc;

View File

@ -61,6 +61,8 @@ void load_file(const char *filepath, vfile_t *f) {
f->read = fs_read;
f->close = fs_close;
f->is_fs_file = TRUE;
f->calculate_checksum = TRUE;
f->has_checksum = FALSE;
}
void load_mem(void *mem, size_t size, vfile_t *f) {