Support for sha1sum

This commit is contained in:
2021-09-11 13:00:59 -04:00
parent 23da8ada5f
commit 52d7649322
9 changed files with 129 additions and 29 deletions

View File

@@ -46,7 +46,7 @@ static scan_wpd_ctx_t wpd_ctx;
static scan_json_ctx_t json_ctx;
document_t LastSubDoc;
static document_t LastSubDoc;
void _parse_media(parse_job_t *job) {
parse_media(&media_ctx, &job->vfile, &LastSubDoc);
@@ -225,6 +225,24 @@ TEST(Ebook, Utf8Pdf) {
cleanup(&doc, &f);
}
// Regression test: a PDF whose embedded text layer was deliberately
// corrupted by its authors. The sentence should read "HART is a group of
// highly qualified ...", but only some characters survive extraction; we
// verify that non-printable/invalid characters (e.g. '<27>') are filtered
// out of the extracted content.
TEST(Ebook, Utf8PdfInvalidChars) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/ebook/invalid_chars.pdf", &file, &document);

    // Run the parse with OCR disabled, then restore the default language.
    ebook_ctx.tesseract_lang = nullptr;
    parse_ebook(&ebook_ctx, &file, "application/pdf", &document);
    ebook_ctx.tesseract_lang = "eng";

    ASSERT_TRUE(STR_STARTS_WITH(get_meta(&document, MetaContent)->str_val, "HART i a g f highl alified "));

    cleanup(&document, &file);
}
TEST(Ebook, Pdf2) {
vfile_t f;
document_t doc;
@@ -418,6 +436,20 @@ TEST(MediaImage, Mem1) {
cleanup(&doc, &f);
}
// Parsing this JPEG from the filesystem must grow the store by exactly
// 14098 bytes — presumably the source file stored as-is (test name "AsIsFs");
// the assertion only pins the exact size delta.
TEST(MediaImage, AsIsFs) {
    vfile_t file;
    document_t document;
    load_doc_file("libscan-test-files/test_files/media/9555.jpg", &file, &document);

    size_t initial_store_size = store_size;
    parse_media(&media_ctx, &file, &document);
    ASSERT_EQ(initial_store_size + 14098, store_size);

    cleanup(&document, &file);
}
TEST(MediaImage, Mem2AsIs) {
vfile_t f;
document_t doc;

View File

@@ -61,6 +61,8 @@ void load_file(const char *filepath, vfile_t *f) {
f->read = fs_read;
f->close = fs_close;
f->is_fs_file = TRUE;
f->calculate_checksum = TRUE;
f->has_checksum = FALSE;
}
void load_mem(void *mem, size_t size, vfile_t *f) {