mirror of
https://github.com/simon987/libscan.git
synced 2025-04-05 04:22:58 +00:00
Support for sha1sum
This commit is contained in:
parent
23da8ada5f
commit
52d7649322
@ -4,6 +4,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <openssl/evp.h>
|
||||
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
@ -33,18 +34,29 @@ int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
size_t read = archive_read_data(f->arc, buf, size);
|
||||
void arc_close(struct vfile *f) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
}
|
||||
|
||||
if (read != size) {
|
||||
const char* error_str = archive_error_string(f->arc);
|
||||
|
||||
int arc_read(struct vfile *f, void *buf, size_t size) {
|
||||
size_t bytes_read = archive_read_data(f->arc, buf, size);
|
||||
|
||||
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char*)buf, bytes_read);
|
||||
}
|
||||
|
||||
if (bytes_read != size) {
|
||||
const char *error_str = archive_error_string(f->arc);
|
||||
if (error_str != NULL) {
|
||||
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
return read;
|
||||
return (int) bytes_read;
|
||||
}
|
||||
|
||||
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
|
||||
@ -58,7 +70,7 @@ int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *ar
|
||||
archive_read_add_passphrase(*a, ctx->passphrase);
|
||||
}
|
||||
|
||||
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
|
||||
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
|
||||
} else if (allow_recurse) {
|
||||
*a = archive_read_new();
|
||||
archive_read_support_filter_all(*a);
|
||||
@ -102,8 +114,8 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
|
||||
const char* utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char* file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
|
||||
|
||||
dyn_buffer_append_string(&buf, file_path);
|
||||
dyn_buffer_write_char(&buf, ' ');
|
||||
@ -121,7 +133,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
|
||||
|
||||
sub_job->vfile.close = NULL;
|
||||
sub_job->vfile.close = arc_close;
|
||||
sub_job->vfile.read = arc_read;
|
||||
sub_job->vfile.reset = NULL;
|
||||
sub_job->vfile.arc = a;
|
||||
@ -129,13 +141,15 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
sub_job->vfile.is_fs_file = FALSE;
|
||||
sub_job->vfile.log = ctx->log;
|
||||
sub_job->vfile.logf = ctx->logf;
|
||||
sub_job->vfile.has_checksum = FALSE;
|
||||
sub_job->vfile.calculate_checksum = f->calculate_checksum;
|
||||
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
sub_job->vfile.info = *archive_entry_stat(entry);
|
||||
if (S_ISREG(sub_job->vfile.info.st_mode)) {
|
||||
|
||||
const char* utf8_name = archive_entry_pathname_utf8(entry);
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
|
||||
if (utf8_name == NULL) {
|
||||
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
|
||||
@ -151,6 +165,9 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
sub_job->ext = (int) strlen(sub_job->filepath);
|
||||
}
|
||||
|
||||
memset(&sub_job->vfile.sha1_ctx, 0, sizeof(sub_job->vfile.sha1_ctx));
|
||||
SHA1_Init(&sub_job->vfile.sha1_ctx);
|
||||
|
||||
ctx->parse(sub_job);
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include <fcntl.h>
|
||||
#include "../scan.h"
|
||||
|
||||
# define ARC_SKIPPED -1
|
||||
# define ARC_SKIPPED (-1)
|
||||
#define ARC_MODE_SKIP 0
|
||||
#define ARC_MODE_LIST 1
|
||||
#define ARC_MODE_SHALLOW 2
|
||||
@ -31,27 +31,34 @@ typedef struct {
|
||||
} arc_data_t;
|
||||
|
||||
static int vfile_open_callback(struct archive *a, void *user_data) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
if (data->f->is_fs_file && data->f->fd == -1) {
|
||||
data->f->fd = open(data->f->filepath, O_RDONLY);
|
||||
if (!data->f->is_fs_file) {
|
||||
SHA1_Init(&data->f->sha1_ctx);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
}
|
||||
|
||||
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
*buf = data->buf;
|
||||
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
|
||||
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
|
||||
|
||||
if (!data->f->is_fs_file && ret > 0) {
|
||||
data->f->has_checksum = TRUE;
|
||||
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int vfile_close_callback(struct archive *a, void *user_data) {
|
||||
arc_data_t *data = (arc_data_t*)user_data;
|
||||
arc_data_t *data = (arc_data_t *) user_data;
|
||||
|
||||
if (data->f->close != NULL) {
|
||||
data->f->close(data->f);
|
||||
if (!data->f->is_fs_file) {
|
||||
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
|
||||
}
|
||||
|
||||
return ARCHIVE_OK;
|
||||
@ -63,6 +70,8 @@ int should_parse_filtered_file(const char *filepath, int ext);
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc);
|
||||
|
||||
int arc_read(struct vfile * f, void *buf, size_t size);
|
||||
int arc_read(struct vfile *f, void *buf, size_t size);
|
||||
|
||||
void arc_close(struct vfile *f);
|
||||
|
||||
#endif
|
||||
|
@ -67,10 +67,10 @@ scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
char *buf = malloc(JSON_BUF_SIZE + 1);
|
||||
char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
|
||||
*(buf + JSON_BUF_SIZE) = '\0';
|
||||
|
||||
text_buffer_t tex = text_buffer_create(-1);
|
||||
text_buffer_t tex = text_buffer_create(ctx->content_size);
|
||||
|
||||
size_t ret;
|
||||
int eof = FALSE;
|
||||
|
@ -20,6 +20,9 @@
|
||||
#undef ABS
|
||||
#define ABS(a) (((a) < 0) ? -(a) : (a))
|
||||
|
||||
#define SHA1_STR_LENGTH 41
|
||||
#define SHA1_DIGEST_LENGTH 20
|
||||
|
||||
#define APPEND_STR_META(doc, keyname, value) \
|
||||
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
|
||||
meta_str->key = keyname; \
|
||||
|
@ -511,7 +511,7 @@ int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
|
||||
return AVERROR_EOF;
|
||||
}
|
||||
|
||||
return buf_size;
|
||||
return (int) ret;
|
||||
}
|
||||
|
||||
long memfile_seek(void *ptr, long offset, int whence) {
|
||||
@ -540,11 +540,18 @@ int memfile_open(vfile_t *f, memfile_t *mem) {
|
||||
int ret = f->read(f, mem->buf, mem->info.st_size);
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
|
||||
if (f->calculate_checksum) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->info.st_size);
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
f->has_checksum = TRUE;
|
||||
}
|
||||
|
||||
return (ret == mem->info.st_size && mem->file != NULL) ? 0 : -1;
|
||||
}
|
||||
|
||||
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
|
||||
mem->info.st_size = buf_len;
|
||||
mem->info.st_size = (int) buf_len;
|
||||
|
||||
mem->buf = buf;
|
||||
mem->file = fmemopen(mem->buf, mem->info.st_size, "rb");
|
||||
@ -619,7 +626,7 @@ void init_media() {
|
||||
}
|
||||
|
||||
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
|
||||
memfile_t memfile;
|
||||
memfile_t memfile = {{}, 0, 0};
|
||||
AVIOContext *io_ctx = NULL;
|
||||
|
||||
AVFormatContext *pFormatCtx = avformat_alloc_context();
|
||||
@ -637,8 +644,6 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
} else {
|
||||
avformat_close_input(&pFormatCtx);
|
||||
avformat_free_context(pFormatCtx);
|
||||
av_free(io_ctx->buffer);
|
||||
avio_context_free(&io_ctx);
|
||||
fclose(memfile.file);
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <openssl/md5.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
@ -69,6 +70,7 @@ enum metakey {
|
||||
MetaAuthor,
|
||||
MetaModifiedBy,
|
||||
MetaThumbnail,
|
||||
MetaChecksum,
|
||||
|
||||
// Number
|
||||
MetaWidth,
|
||||
@ -130,9 +132,14 @@ typedef struct vfile {
|
||||
};
|
||||
|
||||
int is_fs_file;
|
||||
int has_checksum;
|
||||
int calculate_checksum;
|
||||
const char *filepath;
|
||||
struct stat info;
|
||||
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
|
||||
read_func_t read;
|
||||
seek_func_t seek;
|
||||
close_func_t close;
|
||||
|
@ -13,7 +13,10 @@
|
||||
#define INITIAL_BUF_SIZE (1024 * 16)
|
||||
|
||||
// True when code point `c` must be dropped; exact negation of SHOULD_KEEP_CHAR.
#define SHOULD_IGNORE_CHAR(c) (!SHOULD_KEEP_CHAR(c))

// True when code point `c` is kept:
//   - ASCII from '\'' (0x27) through ';' (0x3B): punctuation and digits
//   - ASCII from 'A' (0x41) through 'z' (0x7A): letters and the symbols between 'Z' and 'a'
//   - anything above 127, except U+00A0 and U+FFFD
// The argument is evaluated several times; do not pass an expression with side effects.
#define SHOULD_KEEP_CHAR(c) (\
    ((c) >= '\'' && (c) <= ';') || \
    ((c) >= 'A' && (c) <= 'z') || \
    ((c) > 127 && (c) != 0x00A0 && (c) != 0xFFFD))
|
||||
|
||||
|
||||
typedef struct dyn_buffer {
|
||||
@ -333,4 +336,26 @@ static void *read_all(vfile_t *f, size_t *size) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)
|
||||
|
||||
__always_inline
|
||||
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
|
||||
unsigned char stack_buf[STACK_BUFFER_SIZE];
|
||||
|
||||
void *sha1_buf;
|
||||
if (size <= STACK_BUFFER_SIZE) {
|
||||
sha1_buf = stack_buf;
|
||||
} else {
|
||||
void *heap_sha1_buf = malloc(size);
|
||||
sha1_buf = heap_sha1_buf;
|
||||
}
|
||||
|
||||
memcpy(sha1_buf, buf, size);
|
||||
SHA1_Update(ctx, (const void *) sha1_buf, size);
|
||||
|
||||
if (sha1_buf != stack_buf) {
|
||||
free(sha1_buf);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -46,7 +46,7 @@ static scan_wpd_ctx_t wpd_ctx;
|
||||
static scan_json_ctx_t json_ctx;
|
||||
|
||||
|
||||
document_t LastSubDoc;
|
||||
static document_t LastSubDoc;
|
||||
|
||||
void _parse_media(parse_job_t *job) {
|
||||
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
|
||||
@ -225,6 +225,24 @@ TEST(Ebook, Utf8Pdf) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
TEST(Ebook, Utf8PdfInvalidChars) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &f, &doc);

    // tesseract_lang is cleared for this parse only and set back to "eng" right after
    ebook_ctx.tesseract_lang = nullptr;
    parse_ebook(&ebook_ctx, &f, "application/pdf", &doc);
    ebook_ctx.tesseract_lang = "eng";

    // The text should read "HART is a group of highly qualified ...", but the
    // PDF's text layer has been deliberately scrambled by its authors.
    // The most we can do is filter out non-printable/invalid characters
    // such as the U+FFFD replacement character.
    const char *content = get_meta(&doc, MetaContent)->str_val;
    ASSERT_TRUE(STR_STARTS_WITH(content, "HART i a g f highl alified "));

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(Ebook, Pdf2) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
@ -418,6 +436,20 @@ TEST(MediaImage, Mem1) {
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
|
||||
TEST(MediaImage, AsIsFs) {
    vfile_t f;
    document_t doc;
    load_doc_file("libscan-test-files/test_files/media/9555.jpg", &f, &doc);

    // Snapshot the store counter so the assertion measures only this parse.
    const size_t baseline = store_size;

    parse_media(&media_ctx, &f, &doc);

    // NOTE(review): 14098 is presumably the byte size of 9555.jpg being
    // stored unmodified ("as is") - confirm against the fixture.
    const size_t expected = baseline + 14098;
    ASSERT_EQ(expected, store_size);

    cleanup(&doc, &f);
}
|
||||
|
||||
TEST(MediaImage, Mem2AsIs) {
|
||||
vfile_t f;
|
||||
document_t doc;
|
||||
|
@ -61,6 +61,8 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
f->read = fs_read;
|
||||
f->close = fs_close;
|
||||
f->is_fs_file = TRUE;
|
||||
f->calculate_checksum = TRUE;
|
||||
f->has_checksum = FALSE;
|
||||
}
|
||||
|
||||
void load_mem(void *mem, size_t size, vfile_t *f) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user