mirror of
https://github.com/simon987/sist2.git
synced 2025-12-10 22:18:54 +00:00
process pool mostly works, still WIP
This commit is contained in:
22
third-party/libscan/libscan/arc/arc.c
vendored
22
third-party/libscan/libscan/arc/arc.c
vendored
@@ -188,14 +188,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
|
||||
} else {
|
||||
|
||||
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
|
||||
parse_job_t *sub_job = malloc(sizeof(parse_job_t));
|
||||
|
||||
sub_job->vfile.close = arc_close;
|
||||
sub_job->vfile.read = arc_read;
|
||||
sub_job->vfile.read_rewindable = arc_read_rewindable;
|
||||
sub_job->vfile.reset = NULL;
|
||||
sub_job->vfile.arc = a;
|
||||
sub_job->vfile.filepath = sub_job->filepath;
|
||||
sub_job->vfile.is_fs_file = FALSE;
|
||||
sub_job->vfile.rewind_buffer_size = 0;
|
||||
sub_job->vfile.rewind_buffer = NULL;
|
||||
@@ -206,22 +205,29 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
strcpy(sub_job->parent, doc->doc_id);
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
sub_job->vfile.info = *archive_entry_stat(entry);
|
||||
struct stat entry_stat = *archive_entry_stat(entry);
|
||||
sub_job->vfile.st_mode = entry_stat.st_mode;
|
||||
sub_job->vfile.st_size = entry_stat.st_size;
|
||||
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
|
||||
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.info.st_size / (double) f->info.st_size;
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
|
||||
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath, decompressed_size_ratio)
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
|
||||
decompressed_size_ratio)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (S_ISREG(sub_job->vfile.info.st_mode)) {
|
||||
if (S_ISREG(sub_job->vfile.st_mode)) {
|
||||
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
|
||||
if (utf8_name == NULL) {
|
||||
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
|
||||
snprintf(sub_job->filepath, sizeof(sub_job->filepath), "%s#/%s", f->filepath,
|
||||
archive_entry_pathname(entry));
|
||||
strcpy(sub_job->vfile.filepath, sub_job->filepath);
|
||||
} else {
|
||||
sprintf(sub_job->filepath, "%s#/%s", f->filepath, utf8_name);
|
||||
snprintf(sub_job->filepath, sizeof(sub_job->filepath), "%s#/%s", f->filepath, utf8_name);
|
||||
strcpy(sub_job->vfile.filepath, sub_job->filepath);
|
||||
}
|
||||
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
|
||||
|
||||
|
||||
14
third-party/libscan/libscan/ebook/ebook.c
vendored
14
third-party/libscan/libscan/ebook/ebook.c
vendored
@@ -1,28 +1,34 @@
|
||||
#include "ebook.h"
|
||||
#include <mupdf/fitz.h>
|
||||
#include <pthread.h>
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#include "../media/media.h"
|
||||
#include "../arc/arc.h"
|
||||
#include "../ocr/ocr.h"
|
||||
|
||||
#if EBOOK_LOCKS
|
||||
#include <pthread.h>
|
||||
pthread_mutex_t Mutex;
|
||||
#endif
|
||||
|
||||
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
|
||||
__thread text_buffer_t thread_buffer;
|
||||
__thread scan_ebook_ctx_t thread_ctx;
|
||||
|
||||
pthread_mutex_t Mutex;
|
||||
|
||||
/**
 * Lock callback: acquires the file-level `Mutex` when `lock` is
 * FZ_LOCK_FREETYPE. Every other lock id is ignored.
 *
 * The whole body is compiled out when EBOOK_LOCKS evaluates to 0, in which
 * case the function is a no-op.
 *
 * @param user unused opaque pointer
 * @param lock identifier of the lock being requested
 */
static void my_fz_lock(UNUSED(void *user), int lock) {
#if EBOOK_LOCKS
    if (lock != FZ_LOCK_FREETYPE) {
        /* Only the FreeType lock is backed by a real mutex. */
        return;
    }
    pthread_mutex_lock(&Mutex);
#endif
}
|
||||
|
||||
/**
 * Unlock callback: releases the file-level `Mutex` when `lock` is
 * FZ_LOCK_FREETYPE. Every other lock id is ignored.
 *
 * The whole body is compiled out when EBOOK_LOCKS evaluates to 0, in which
 * case the function is a no-op.
 *
 * @param user unused opaque pointer
 * @param lock identifier of the lock being released
 */
static void my_fz_unlock(UNUSED(void *user), int lock) {
#if EBOOK_LOCKS
    if (lock != FZ_LOCK_FREETYPE) {
        /* Only the FreeType lock is backed by a real mutex. */
        return;
    }
    pthread_mutex_unlock(&Mutex);
#endif
}
|
||||
|
||||
|
||||
@@ -187,11 +193,13 @@ void fz_warn_callback(void *user, const char *message) {
|
||||
static void init_fzctx(fz_context *fzctx, document_t *doc) {
|
||||
fz_register_document_handlers(fzctx);
|
||||
|
||||
#if EBOOK_LOCKS
|
||||
static int mu_is_initialized = FALSE;
|
||||
if (!mu_is_initialized) {
|
||||
pthread_mutex_init(&Mutex, NULL);
|
||||
mu_is_initialized = TRUE;
|
||||
}
|
||||
#endif
|
||||
|
||||
fzctx->warn.print_user = doc;
|
||||
fzctx->warn.print = fz_warn_callback;
|
||||
|
||||
1
third-party/libscan/libscan/ebook/ebook.h
vendored
1
third-party/libscan/libscan/ebook/ebook.h
vendored
@@ -9,7 +9,6 @@ typedef struct {
|
||||
int enable_tn;
|
||||
const char *tesseract_lang;
|
||||
const char *tesseract_path;
|
||||
pthread_mutex_t mupdf_mutex;
|
||||
|
||||
log_callback_t log;
|
||||
logf_callback_t logf;
|
||||
|
||||
2
third-party/libscan/libscan/json/json.c
vendored
2
third-party/libscan/libscan/json/json.c
vendored
@@ -32,7 +32,7 @@ int json_extract_text(cJSON *json, text_buffer_t *tex) {
|
||||
|
||||
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
if (f->info.st_size > JSON_MAX_FILE_SIZE) {
|
||||
if (f->st_size > JSON_MAX_FILE_SIZE) {
|
||||
CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
|
||||
return SCAN_ERR_SKIP;
|
||||
}
|
||||
|
||||
8
third-party/libscan/libscan/media/media.c
vendored
8
third-party/libscan/libscan/media/media.c
vendored
@@ -687,7 +687,7 @@ long memfile_seek(void *ptr, long offset, int whence) {
|
||||
}
|
||||
|
||||
int memfile_open(vfile_t *f, memfile_t *mem) {
|
||||
mem->size = f->info.st_size;
|
||||
mem->size = f->st_size;
|
||||
|
||||
mem->buf = malloc(mem->size);
|
||||
if (mem->buf == NULL) {
|
||||
@@ -737,16 +737,16 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc,
|
||||
|
||||
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
|
||||
|
||||
if (f->info.st_size <= ctx->max_media_buffer) {
|
||||
if (f->st_size <= ctx->max_media_buffer) {
|
||||
int ret = memfile_open(f, &memfile);
|
||||
if (ret == 0) {
|
||||
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
|
||||
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->st_size)
|
||||
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
|
||||
}
|
||||
}
|
||||
|
||||
if (io_ctx == NULL) {
|
||||
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
|
||||
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->st_size)
|
||||
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
14
third-party/libscan/libscan/scan.h
vendored
14
third-party/libscan/libscan/scan.h
vendored
@@ -51,6 +51,8 @@ typedef int scan_code_t;
|
||||
#define SIST_DOC_ID_LEN MD5_STR_LENGTH
|
||||
#define SIST_INDEX_ID_LEN MD5_STR_LENGTH
|
||||
|
||||
#define EBOOK_LOCKS 0
|
||||
|
||||
enum metakey {
|
||||
// String
|
||||
MetaContent = 1,
|
||||
@@ -100,7 +102,6 @@ typedef struct meta_line {
|
||||
union {
|
||||
char str_val[0];
|
||||
unsigned long long_val;
|
||||
double double_val;
|
||||
};
|
||||
} meta_line_t;
|
||||
|
||||
@@ -114,7 +115,7 @@ typedef struct document {
|
||||
short ext;
|
||||
meta_line_t *meta_head;
|
||||
meta_line_t *meta_tail;
|
||||
char filepath[PATH_MAX];
|
||||
char filepath[PATH_MAX * 2 + 1];
|
||||
} document_t;
|
||||
|
||||
typedef struct vfile vfile_t;
|
||||
@@ -139,8 +140,11 @@ typedef struct vfile {
|
||||
int is_fs_file;
|
||||
int has_checksum;
|
||||
int calculate_checksum;
|
||||
const char *filepath;
|
||||
struct stat info;
|
||||
char filepath[PATH_MAX * 2 + 1];
|
||||
|
||||
int mtime;
|
||||
size_t st_size;
|
||||
unsigned int st_mode;
|
||||
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
@@ -162,7 +166,7 @@ typedef struct parse_job_t {
|
||||
int ext;
|
||||
struct vfile vfile;
|
||||
char parent[SIST_DOC_ID_LEN];
|
||||
char filepath[PATH_MAX];
|
||||
char filepath[PATH_MAX * 2 + 1];
|
||||
} parse_job_t;
|
||||
|
||||
|
||||
|
||||
4
third-party/libscan/libscan/text/text.c
vendored
4
third-party/libscan/libscan/text/text.c
vendored
@@ -2,7 +2,7 @@
|
||||
|
||||
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
int to_read = MIN(ctx->content_size, f->info.st_size);
|
||||
int to_read = MIN(ctx->content_size, f->st_size);
|
||||
|
||||
if (to_read <= 2) {
|
||||
return SCAN_OK;
|
||||
@@ -39,7 +39,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
|
||||
int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
|
||||
int to_read = MIN(MAX_MARKUP_SIZE, f->st_size);
|
||||
|
||||
char *buf = malloc(to_read + 1);
|
||||
int ret = f->read(f, buf, to_read);
|
||||
|
||||
6
third-party/libscan/libscan/util.h
vendored
6
third-party/libscan/libscan/util.h
vendored
@@ -325,10 +325,10 @@ static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
|
||||
}
|
||||
|
||||
static void *read_all(vfile_t *f, size_t *size) {
|
||||
void *buf = malloc(f->info.st_size);
|
||||
*size = f->read(f, buf, f->info.st_size);
|
||||
void *buf = malloc(f->st_size);
|
||||
*size = f->read(f, buf, f->st_size);
|
||||
|
||||
if (*size != f->info.st_size) {
|
||||
if (*size != f->st_size) {
|
||||
free(buf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
14
third-party/libscan/test/test_util.cpp
vendored
14
third-party/libscan/test/test_util.cpp
vendored
@@ -50,14 +50,20 @@ void cleanup(document_t *doc, vfile_t *f) {
|
||||
}
|
||||
|
||||
void load_file(const char *filepath, vfile_t *f) {
|
||||
stat(filepath, &f->info);
|
||||
struct stat info = {};
|
||||
stat(filepath, &info);
|
||||
|
||||
f->mtime = (int)info.st_mtim.tv_sec;
|
||||
f->st_size = info.st_size;
|
||||
f->st_mode = info.st_mode;
|
||||
|
||||
f->fd = open(filepath, O_RDONLY);
|
||||
|
||||
if (f->fd == -1) {
|
||||
FAIL() << FILE_NOT_FOUND_ERR;
|
||||
}
|
||||
|
||||
f->filepath = filepath;
|
||||
memcpy(f->filepath, filepath, sizeof(f->filepath));
|
||||
f->read = fs_read;
|
||||
f->close = fs_close;
|
||||
f->is_fs_file = TRUE;
|
||||
@@ -66,9 +72,9 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
}
|
||||
|
||||
void load_mem(void *mem, size_t size, vfile_t *f) {
|
||||
f->filepath = "_mem_";
|
||||
memcpy(f->filepath, "_mem_", strlen("_mem_"));
|
||||
f->_test_data = mem;
|
||||
f->info.st_size = (int) size;
|
||||
f->st_size = size;
|
||||
f->read = mem_read;
|
||||
f->close = nullptr;
|
||||
f->is_fs_file = TRUE;
|
||||
|
||||
Reference in New Issue
Block a user