From fe6232ed8222ee33178d6c50ff2c5ea6312e7dee Mon Sep 17 00:00:00 2001 From: simon987 Date: Sun, 10 May 2020 19:52:42 -0400 Subject: [PATCH] UTF-8 fix attempt w/ libarchive (simon987/sist2#44) --- libscan/arc/arc.c | 8 ++++---- test/main.cpp | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/libscan/arc/arc.c b/libscan/arc/arc.c index 89b8ac6..e46dc53 100644 --- a/libscan/arc/arc.c +++ b/libscan/arc/arc.c @@ -88,7 +88,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { archive_read_support_filter_all(a); archive_read_support_format_all(a); - ret = archive_read_open_filename(a, doc->filepath, ARC_BUF_SIZE); + ret = archive_read_open_filename(a, f->filepath, ARC_BUF_SIZE); } else if (ctx->mode == ARC_MODE_RECURSE) { a = archive_read_new(); @@ -106,7 +106,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { } if (ret != ARCHIVE_OK) { - CTX_LOG_ERRORF(doc->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a)) + CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a)) archive_read_free(a); return SCAN_ERR_READ; } @@ -118,7 +118,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { if (S_ISREG(archive_entry_stat(entry)->st_mode)) { - char *path = (char *) archive_entry_pathname(entry); + char *path = (char *) archive_entry_pathname_utf8(entry); dyn_buffer_append_string(&buf, path); dyn_buffer_write_char(&buf, '\n'); @@ -147,7 +147,7 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc) { while (archive_read_next_header(a, &entry) == ARCHIVE_OK) { sub_job->vfile.info = *archive_entry_stat(entry); if (S_ISREG(sub_job->vfile.info.st_mode)) { - sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry)); + sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname_utf8(entry)); sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1; char *p = strrchr(sub_job->filepath, '.'); diff --git a/test/main.cpp b/test/main.cpp index e0d4d0f..e56fcb7 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -420,7 +420,23 @@ TEST(Mobi, Azw3) { cleanup(&doc, &f); } +/* Arc */ +TEST(Arc, Utf8) { + vfile_t f; + document_t doc; + load_doc_file("libscan-test-files/test_files/arc/test1.zip", &f, &doc); + + parse_archive(&arc_list_ctx, &f, &doc); + + ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "arctest/ȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬȬ.txt") != nullptr); + + cleanup(&doc, &f); +} + + int main(int argc, char **argv) { + setlocale(LC_ALL, ""); + arc_recurse_ctx.log = noop_log; arc_recurse_ctx.logf = noop_logf; arc_recurse_ctx.store = noop_store;