mirror of
https://github.com/simon987/sist2.git
synced 2025-04-03 07:22:59 +00:00
Add test files as submodule, remove support for msword thumbnails
This commit is contained in:
parent
87ecc5ef6d
commit
9e0d7bf992
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -10,3 +10,6 @@
|
||||
[submodule "third-party/libscan/third-party/libmobi"]
|
||||
path = third-party/libscan/third-party/libmobi
|
||||
url = https://github.com/bfabiszewski/libmobi
|
||||
[submodule "third-party/libscan/libscan-test-files"]
|
||||
path = third-party/libscan/libscan-test-files
|
||||
url = https://github.com/simon987/libscan-test-files
|
||||
|
@ -81,7 +81,7 @@ See [Usage guide](docs/USAGE.md) for more details
|
||||
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
|
||||
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
|
||||
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
|
||||
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
|
||||
| doc (MS Word 97-2003) | antiword | yes | no | author, title |
|
||||
| mobi, azw, azw3 | libmobi | yes | no | author, title |
|
||||
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
|
||||
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
|
||||
|
1
third-party/libscan/libscan-test-files
vendored
Submodule
1
third-party/libscan/libscan-test-files
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit cdf1f89423424b2091520bfe8c580d682fe01f7d
|
58
third-party/libscan/libscan/msdoc/msdoc.c
vendored
58
third-party/libscan/libscan/msdoc/msdoc.c
vendored
@ -4,8 +4,6 @@
|
||||
#include <sys/mman.h>
|
||||
#include "../../third-party/antiword/src/antiword.h"
|
||||
|
||||
#include "../ebook/ebook.h"
|
||||
|
||||
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {
|
||||
|
||||
// Open word doc
|
||||
@ -71,57 +69,6 @@ void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, voi
|
||||
free(out_buf);
|
||||
}
|
||||
|
||||
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {
|
||||
|
||||
scan_ebook_ctx_t ebook_ctx = {
|
||||
.content_size = ctx->content_size,
|
||||
.tn_size = ctx->tn_size,
|
||||
.enable_tn = TRUE,
|
||||
.log = ctx->log,
|
||||
.logf = ctx->logf,
|
||||
.store = ctx->store,
|
||||
};
|
||||
|
||||
// Open word doc
|
||||
options_type *opts = direct_vGetOptions();
|
||||
opts->iParagraphBreak = 74;
|
||||
opts->eConversionType = conversion_pdf;
|
||||
opts->bHideHiddenText = 1;
|
||||
opts->bRemoveRemovedText = 1;
|
||||
opts->bUseLandscape = 0;
|
||||
opts->eEncoding = encoding_latin_1;
|
||||
opts->iPageHeight = 842; // A4
|
||||
opts->iPageWidth = 595;
|
||||
opts->eImageLevel = level_ps_3;
|
||||
|
||||
int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
|
||||
if (doc_word_version < 0 || doc_word_version == 3) {
|
||||
free(buf);
|
||||
return;
|
||||
}
|
||||
rewind(file);
|
||||
|
||||
size_t out_len;
|
||||
char *out_buf;
|
||||
|
||||
FILE *file_out = open_memstream(&out_buf, &out_len);
|
||||
|
||||
diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
|
||||
if (diag == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
bWordDecryptor(file, (int) buf_len, diag);
|
||||
vDestroyDiagram(diag);
|
||||
|
||||
fclose(file_out);
|
||||
|
||||
parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);
|
||||
|
||||
free(buf);
|
||||
free(out_buf);
|
||||
}
|
||||
|
||||
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
size_t buf_len;
|
||||
@ -138,11 +85,6 @@ void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctx->enable_tn) {
|
||||
char *buf_pdf = malloc(buf_len);
|
||||
memcpy(buf_pdf, buf, buf_len);
|
||||
parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
|
||||
}
|
||||
parse_msdoc_text(ctx, doc, file, buf, buf_len);
|
||||
fclose(file);
|
||||
}
|
||||
|
2
third-party/libscan/libscan/msdoc/msdoc.h
vendored
2
third-party/libscan/libscan/msdoc/msdoc.h
vendored
@ -5,8 +5,6 @@
|
||||
|
||||
typedef struct {
|
||||
long content_size;
|
||||
int enable_tn;
|
||||
int tn_size;
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
store_callback_t store;
|
||||
|
23
third-party/libscan/test/main.cpp
vendored
23
third-party/libscan/test/main.cpp
vendored
@ -916,15 +916,12 @@ TEST(Msdoc, Test1Pdf) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -934,15 +931,12 @@ TEST(Msdoc, Test1Text) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test1.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_text_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "October 2000") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "INTERNATIONAL ORGANIZATION FOR STANDARDIZATION");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Oliver Morgan");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_EQ(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -952,15 +946,12 @@ TEST(Msdoc, Test2Pdf) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test2.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "GNU Free Documentation License") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "DWARF Debugging Information Format");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Ron Brender");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -970,15 +961,12 @@ TEST(Msdoc, Test3Pdf) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test3.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "INTERNATIONAL PATENT CLASSIFICATION") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "IPC Fixed Texts Specification");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Fievet");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -988,15 +976,12 @@ TEST(Msdoc, Test4Pdf) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test4.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "SQL Server international data types") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaTitle)->str_val, "MSDN Authoring Template");
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "Brenda Yen");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -1012,7 +997,6 @@ TEST(Msdoc, TestUtf8Pdf) {
|
||||
|
||||
ASSERT_NE(get_meta(&doc, MetaContent), nullptr);
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "调查项目 A questionnaire") != nullptr);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -1034,14 +1018,11 @@ TEST(Msdoc, Test5Pdf) {
|
||||
document_t doc;
|
||||
load_doc_file("libscan-test-files/test_files/msdoc/test5.doc", &f, &doc);
|
||||
|
||||
size_t size_before = store_size;
|
||||
|
||||
parse_msdoc(&msdoc_ctx, &f, &doc);
|
||||
|
||||
ASSERT_TRUE(strstr(get_meta(&doc, MetaContent)->str_val, "орган Федеральной") != nullptr);
|
||||
ASSERT_STREQ(get_meta(&doc, MetaAuthor)->str_val, "uswo");
|
||||
ASSERT_NEAR(strlen(get_meta(&doc, MetaContent)->str_val), msdoc_ctx.content_size, 4);
|
||||
ASSERT_NE(size_before, store_size);
|
||||
|
||||
cleanup(&doc, &f);
|
||||
}
|
||||
@ -1184,15 +1165,11 @@ int main(int argc, char **argv) {
|
||||
msdoc_ctx.logf = noop_logf;
|
||||
msdoc_ctx.store = counter_store;
|
||||
msdoc_ctx.content_size = 500;
|
||||
msdoc_ctx.tn_size = 500;
|
||||
msdoc_ctx.enable_tn = TRUE;
|
||||
|
||||
msdoc_text_ctx.log = noop_log;
|
||||
msdoc_text_ctx.logf = noop_logf;
|
||||
msdoc_text_ctx.store = counter_store;
|
||||
msdoc_text_ctx.content_size = 500;
|
||||
msdoc_text_ctx.tn_size = 0;
|
||||
msdoc_text_ctx.enable_tn = FALSE;
|
||||
|
||||
wpd_ctx.log = noop_log;
|
||||
wpd_ctx.logf = noop_logf;
|
||||
|
Loading…
x
Reference in New Issue
Block a user