use sqlite to save index, major thread pool refactor

This commit is contained in:
2023-04-03 21:39:50 -04:00
parent ca973d63a4
commit fc36f33d52
62 changed files with 3630 additions and 4673 deletions

View File

@@ -97,7 +97,6 @@ find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
@@ -110,6 +109,7 @@ find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
find_package(Leptonica CONFIG REQUIRED)
target_compile_options(
@@ -231,6 +231,7 @@ target_link_libraries(
antiword
mobi
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
leptonica
)
target_include_directories(

View File

@@ -9,27 +9,13 @@
#define MAX_DECOMPRESSED_SIZE_RATIO 40.0
int should_parse_filtered_file(const char *filepath, int ext) {
char tmp[PATH_MAX * 2];
int should_parse_filtered_file(const char *filepath) {
if (ext == 0) {
return FALSE;
}
if (strncmp(filepath + ext, "tgz", 3) == 0) {
if (strstr(filepath, ".tgz")) {
return TRUE;
}
memcpy(tmp, filepath, ext - 1);
*(tmp + ext - 1) = '\0';
char *idx = strrchr(tmp, '.');
if (idx == NULL) {
return FALSE;
}
if (strcmp(idx, ".tar") == 0) {
if (strstr(filepath, ".tar.")) {
return TRUE;
}
@@ -206,18 +192,10 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat entry_stat = *archive_entry_stat(entry);
sub_job->vfile.st_mode = entry_stat.st_mode;
sub_job->vfile.st_size = entry_stat.st_size;
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
decompressed_size_ratio)
continue;
}
if (S_ISREG(sub_job->vfile.st_mode)) {
if (S_ISREG(entry_stat.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
@@ -231,6 +209,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
}
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
decompressed_size_ratio)
break;
}
// Handle excludes
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)

View File

@@ -67,7 +67,7 @@ static int vfile_close_callback(struct archive *a, void *user_data) {
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
int should_parse_filtered_file(const char *filepath, int ext);
int should_parse_filtered_file(const char *filepath);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);

View File

@@ -162,7 +162,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_LONG_META(doc, MetaThumbnail, 1)
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
ctx->store(doc->doc_id, 0, (char *) jpeg_packet.data, jpeg_packet.size);
free(samples);
av_packet_unref(&jpeg_packet);

View File

@@ -232,7 +232,7 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
bmp_format(&bmp_data, dimensions, bitmap);
APPEND_LONG_META(doc, MetaThumbnail, 1)
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) bmp_data.buf, bmp_data.cur);
ctx->store(doc->doc_id, 0, bmp_data.buf, bmp_data.cur);
dyn_buffer_destroy(&bmp_data);
free(bitmap);

View File

@@ -468,8 +468,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
if (scaled_frame == STORE_AS_IS) {
return_value = SAVE_THUMBNAIL_OK;
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
@@ -482,19 +481,17 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
// Save thumbnail
if (thumbnail_index == 0) {
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
return_value = SAVE_THUMBNAIL_OK;
} else if (thumbnail_index > 1) {
return_value = SAVE_THUMBNAIL_OK;
// TO FIX: the 2nd rendered frame is always broken, just skip it until
// I figure out a better fix.
thumbnail_index -= 1;
char tn_key[sizeof(doc->doc_id) + sizeof(char) * 4];
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc->doc_id, thumbnail_index);
ctx->store(doc->doc_id, thumbnail_index, jpeg_packet.data, jpeg_packet.size);
ctx->store((char *) tn_key, sizeof(tn_key), (char *) jpeg_packet.data, jpeg_packet.size);
return_value = SAVE_THUMBNAIL_OK;
} else {
return_value = SAVE_THUMBNAIL_SKIPPED;
}
@@ -854,8 +851,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
if (scaled_frame == STORE_AS_IS) {
APPEND_LONG_META(doc, MetaThumbnail, 1)
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
@@ -868,7 +864,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
// Save thumbnail
APPEND_LONG_META(doc, MetaThumbnail, 1)
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
avcodec_free_context(&jpeg_encoder);

View File

@@ -191,7 +191,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
archive_read_data(a, buf, entry_size);
APPEND_LONG_META(doc, MetaThumbnail, 1)
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), buf, entry_size);
ctx->store(doc->doc_id, 1, buf, entry_size);
free(buf);
}

View File

@@ -6,6 +6,7 @@
#endif
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
@@ -16,7 +17,7 @@
#define UNUSED(x) __attribute__((__unused__)) x
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
typedef void (*store_callback_t)(char *key, int num, void *buf, size_t buf_len);
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
@@ -111,8 +112,8 @@ typedef struct document {
unsigned long size;
unsigned int mime;
int mtime;
short base;
short ext;
int base;
int ext;
meta_line_t *meta_head;
meta_line_t *meta_tail;
char filepath[PATH_MAX * 2 + 1];
@@ -144,7 +145,6 @@ typedef struct vfile {
int mtime;
size_t st_size;
unsigned int st_mode;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
@@ -161,7 +161,7 @@ typedef struct vfile {
logf_callback_t logf;
} vfile_t;
typedef struct parse_job_t {
typedef struct {
int base;
int ext;
struct vfile vfile;

View File

@@ -358,4 +358,37 @@ static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
}
}
static parse_job_t *create_parse_job(const char *filepath, int mtime, size_t st_size) {
parse_job_t *job = (parse_job_t *) malloc(sizeof(parse_job_t));
job->parent[0] = '\0';
strcpy(job->filepath, filepath);
strcpy(job->vfile.filepath, filepath);
job->vfile.st_size = st_size;
job->vfile.mtime = mtime;
const char *slash = strrchr(filepath, '/');
if (slash == NULL) {
job->base = 0;
} else {
job->base = (int) (slash - filepath + 1);
}
const char *dot = strrchr(filepath + job->base, '.');
if (dot == NULL) {
job->ext = (int) strlen(filepath);
} else {
job->ext = (int) (dot - filepath + 1);
}
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
job->vfile.has_checksum = FALSE;
job->vfile.rewind_buffer_size = 0;
job->vfile.rewind_buffer = NULL;
return job;
}
#endif

View File

@@ -55,7 +55,6 @@ void load_file(const char *filepath, vfile_t *f) {
f->mtime = (int)info.st_mtim.tv_sec;
f->st_size = info.st_size;
f->st_mode = info.st_mode;
f->fd = open(filepath, O_RDONLY);

View File

@@ -21,7 +21,7 @@ static void noop_log(const char *filepath, int level, char *str) {
static size_t store_size = 0;
static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
static void counter_store(char* key, int num, void *value, size_t value_len) {
store_size += value_len;
// char id[37];
// char tmp[PATH_MAX];