process pool mostly works, still WIP

This commit is contained in:
2023-03-09 22:11:21 -05:00
parent 8c662bb8f8
commit f8abffba81
25 changed files with 1219 additions and 267 deletions

View File

@@ -188,14 +188,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
} else {
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
parse_job_t *sub_job = malloc(sizeof(parse_job_t));
sub_job->vfile.close = arc_close;
sub_job->vfile.read = arc_read;
sub_job->vfile.read_rewindable = arc_read_rewindable;
sub_job->vfile.reset = NULL;
sub_job->vfile.arc = a;
sub_job->vfile.filepath = sub_job->filepath;
sub_job->vfile.is_fs_file = FALSE;
sub_job->vfile.rewind_buffer_size = 0;
sub_job->vfile.rewind_buffer = NULL;
@@ -206,22 +205,29 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
strcpy(sub_job->parent, doc->doc_id);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
sub_job->vfile.info = *archive_entry_stat(entry);
struct stat entry_stat = *archive_entry_stat(entry);
sub_job->vfile.st_mode = entry_stat.st_mode;
sub_job->vfile.st_size = entry_stat.st_size;
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
double decompressed_size_ratio = (double) sub_job->vfile.info.st_size / (double) f->info.st_size;
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath, decompressed_size_ratio)
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
decompressed_size_ratio)
continue;
}
if (S_ISREG(sub_job->vfile.info.st_mode)) {
if (S_ISREG(sub_job->vfile.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
if (utf8_name == NULL) {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
snprintf(sub_job->filepath, sizeof(sub_job->filepath), "%s#/%s", f->filepath,
archive_entry_pathname(entry));
strcpy(sub_job->vfile.filepath, sub_job->filepath);
} else {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, utf8_name);
snprintf(sub_job->filepath, sizeof(sub_job->filepath), "%s#/%s", f->filepath, utf8_name);
strcpy(sub_job->vfile.filepath, sub_job->filepath);
}
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

View File

@@ -1,28 +1,34 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
#include "../ocr/ocr.h"
#if EBOOK_LOCKS
#include <pthread.h>
pthread_mutex_t Mutex;
#endif
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
pthread_mutex_t Mutex;
/*
 * Lock hook: acquires the file-level Mutex when `lock` is FZ_LOCK_FREETYPE.
 * Every other lock id is ignored. When EBOOK_LOCKS is 0 the body compiles
 * away and the function does nothing.
 *
 * user: unused.
 * lock: lock identifier passed by the caller.
 */
static void my_fz_lock(UNUSED(void *user), int lock) {
#if EBOOK_LOCKS
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_lock(&Mutex);
#endif
}
/*
 * Unlock hook: releases the file-level Mutex when `lock` is
 * FZ_LOCK_FREETYPE. Every other lock id is ignored. When EBOOK_LOCKS is 0
 * the body compiles away and the function does nothing.
 *
 * user: unused.
 * lock: lock identifier passed by the caller.
 */
static void my_fz_unlock(UNUSED(void *user), int lock) {
#if EBOOK_LOCKS
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_unlock(&Mutex);
#endif
}
@@ -187,11 +193,13 @@ void fz_warn_callback(void *user, const char *message) {
static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx);
#if EBOOK_LOCKS
static int mu_is_initialized = FALSE;
if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = TRUE;
}
#endif
fzctx->warn.print_user = doc;
fzctx->warn.print = fz_warn_callback;

View File

@@ -9,7 +9,6 @@ typedef struct {
int enable_tn;
const char *tesseract_lang;
const char *tesseract_path;
pthread_mutex_t mupdf_mutex;
log_callback_t log;
logf_callback_t logf;

View File

@@ -32,7 +32,7 @@ int json_extract_text(cJSON *json, text_buffer_t *tex) {
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (f->info.st_size > JSON_MAX_FILE_SIZE) {
if (f->st_size > JSON_MAX_FILE_SIZE) {
CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
return SCAN_ERR_SKIP;
}

View File

@@ -687,7 +687,7 @@ long memfile_seek(void *ptr, long offset, int whence) {
}
int memfile_open(vfile_t *f, memfile_t *mem) {
mem->size = f->info.st_size;
mem->size = f->st_size;
mem->buf = malloc(mem->size);
if (mem->buf == NULL) {
@@ -737,16 +737,16 @@ void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc,
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
if (f->info.st_size <= ctx->max_media_buffer) {
if (f->st_size <= ctx->max_media_buffer) {
int ret = memfile_open(f, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
}
}
if (io_ctx == NULL) {
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
}

View File

@@ -51,6 +51,8 @@ typedef int scan_code_t;
#define SIST_DOC_ID_LEN MD5_STR_LENGTH
#define SIST_INDEX_ID_LEN MD5_STR_LENGTH
#define EBOOK_LOCKS 0
enum metakey {
// String
MetaContent = 1,
@@ -100,7 +102,6 @@ typedef struct meta_line {
union {
char str_val[0];
unsigned long long_val;
double double_val;
};
} meta_line_t;
@@ -114,7 +115,7 @@ typedef struct document {
short ext;
meta_line_t *meta_head;
meta_line_t *meta_tail;
char filepath[PATH_MAX];
char filepath[PATH_MAX * 2 + 1];
} document_t;
typedef struct vfile vfile_t;
@@ -139,8 +140,11 @@ typedef struct vfile {
int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath;
struct stat info;
char filepath[PATH_MAX * 2 + 1];
int mtime;
size_t st_size;
unsigned int st_mode;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
@@ -162,7 +166,7 @@ typedef struct parse_job_t {
int ext;
struct vfile vfile;
char parent[SIST_DOC_ID_LEN];
char filepath[PATH_MAX];
char filepath[PATH_MAX * 2 + 1];
} parse_job_t;

View File

@@ -2,7 +2,7 @@
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(ctx->content_size, f->info.st_size);
int to_read = MIN(ctx->content_size, f->st_size);
if (to_read <= 2) {
return SCAN_OK;
@@ -39,7 +39,7 @@ scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
int to_read = MIN(MAX_MARKUP_SIZE, f->st_size);
char *buf = malloc(to_read + 1);
int ret = f->read(f, buf, to_read);

View File

@@ -325,10 +325,10 @@ static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
}
static void *read_all(vfile_t *f, size_t *size) {
void *buf = malloc(f->info.st_size);
*size = f->read(f, buf, f->info.st_size);
void *buf = malloc(f->st_size);
*size = f->read(f, buf, f->st_size);
if (*size != f->info.st_size) {
if (*size != f->st_size) {
free(buf);
return NULL;
}

View File

@@ -50,14 +50,20 @@ void cleanup(document_t *doc, vfile_t *f) {
}
void load_file(const char *filepath, vfile_t *f) {
stat(filepath, &f->info);
struct stat info = {};
stat(filepath, &info);
f->mtime = (int)info.st_mtim.tv_sec;
f->st_size = info.st_size;
f->st_mode = info.st_mode;
f->fd = open(filepath, O_RDONLY);
if (f->fd == -1) {
FAIL() << FILE_NOT_FOUND_ERR;
}
f->filepath = filepath;
memcpy(f->filepath, filepath, sizeof(f->filepath));
f->read = fs_read;
f->close = fs_close;
f->is_fs_file = TRUE;
@@ -66,9 +72,9 @@ void load_file(const char *filepath, vfile_t *f) {
}
void load_mem(void *mem, size_t size, vfile_t *f) {
f->filepath = "_mem_";
memcpy(f->filepath, "_mem_", strlen("_mem_"));
f->_test_data = mem;
f->info.st_size = (int) size;
f->st_size = size;
f->read = mem_read;
f->close = nullptr;
f->is_fs_file = TRUE;