mirror of
https://github.com/simon987/sist2.git
synced 2025-12-10 22:18:54 +00:00
use sqlite to save index, major thread pool refactor
This commit is contained in:
3
third-party/libscan/CMakeLists.txt
vendored
3
third-party/libscan/CMakeLists.txt
vendored
@@ -97,7 +97,6 @@ find_package(LibLZMA REQUIRED)
|
||||
find_package(ZLIB REQUIRED)
|
||||
find_package(unofficial-pcre CONFIG REQUIRED)
|
||||
|
||||
|
||||
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
|
||||
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
|
||||
find_library(FREETYPE_LIB NAMES freetype freetyped)
|
||||
@@ -110,6 +109,7 @@ find_library(CMS_LIB NAMES lcms2)
|
||||
find_library(JAS_LIB NAMES jasper)
|
||||
find_library(GUMBO_LIB NAMES gumbo)
|
||||
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
|
||||
find_package(Leptonica CONFIG REQUIRED)
|
||||
|
||||
|
||||
target_compile_options(
|
||||
@@ -231,6 +231,7 @@ target_link_libraries(
|
||||
antiword
|
||||
mobi
|
||||
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
|
||||
leptonica
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
|
||||
37
third-party/libscan/libscan/arc/arc.c
vendored
37
third-party/libscan/libscan/arc/arc.c
vendored
@@ -9,27 +9,13 @@
|
||||
|
||||
#define MAX_DECOMPRESSED_SIZE_RATIO 40.0
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext) {
|
||||
char tmp[PATH_MAX * 2];
|
||||
int should_parse_filtered_file(const char *filepath) {
|
||||
|
||||
if (ext == 0) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (strncmp(filepath + ext, "tgz", 3) == 0) {
|
||||
if (strstr(filepath, ".tgz")) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
memcpy(tmp, filepath, ext - 1);
|
||||
*(tmp + ext - 1) = '\0';
|
||||
|
||||
char *idx = strrchr(tmp, '.');
|
||||
|
||||
if (idx == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (strcmp(idx, ".tar") == 0) {
|
||||
if (strstr(filepath, ".tar.")) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@@ -206,18 +192,10 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
|
||||
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
|
||||
struct stat entry_stat = *archive_entry_stat(entry);
|
||||
sub_job->vfile.st_mode = entry_stat.st_mode;
|
||||
sub_job->vfile.st_size = entry_stat.st_size;
|
||||
sub_job->vfile.mtime = (int) entry_stat.st_mtim.tv_sec;
|
||||
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
|
||||
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
|
||||
decompressed_size_ratio)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (S_ISREG(sub_job->vfile.st_mode)) {
|
||||
if (S_ISREG(entry_stat.st_mode)) {
|
||||
|
||||
const char *utf8_name = archive_entry_pathname_utf8(entry);
|
||||
|
||||
@@ -231,6 +209,13 @@ scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre
|
||||
}
|
||||
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
|
||||
|
||||
double decompressed_size_ratio = (double) sub_job->vfile.st_size / (double) f->st_size;
|
||||
if (decompressed_size_ratio > MAX_DECOMPRESSED_SIZE_RATIO) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Skipped %s, possible zip bomb (decompressed_size_ratio=%f)", sub_job->filepath,
|
||||
decompressed_size_ratio)
|
||||
break;
|
||||
}
|
||||
|
||||
// Handle excludes
|
||||
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
|
||||
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
|
||||
|
||||
2
third-party/libscan/libscan/arc/arc.h
vendored
2
third-party/libscan/libscan/arc/arc.h
vendored
@@ -67,7 +67,7 @@ static int vfile_close_callback(struct archive *a, void *user_data) {
|
||||
|
||||
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
|
||||
|
||||
int should_parse_filtered_file(const char *filepath, int ext);
|
||||
int should_parse_filtered_file(const char *filepath);
|
||||
|
||||
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
|
||||
|
||||
|
||||
2
third-party/libscan/libscan/ebook/ebook.c
vendored
2
third-party/libscan/libscan/ebook/ebook.c
vendored
@@ -162,7 +162,7 @@ int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_d
|
||||
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
free(samples);
|
||||
av_packet_unref(&jpeg_packet);
|
||||
|
||||
2
third-party/libscan/libscan/font/font.c
vendored
2
third-party/libscan/libscan/font/font.c
vendored
@@ -232,7 +232,7 @@ void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
|
||||
bmp_format(&bmp_data, dimensions, bitmap);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store(doc->doc_id, sizeof(doc->doc_id), (char *) bmp_data.buf, bmp_data.cur);
|
||||
ctx->store(doc->doc_id, 0, bmp_data.buf, bmp_data.cur);
|
||||
|
||||
dyn_buffer_destroy(&bmp_data);
|
||||
free(bitmap);
|
||||
|
||||
16
third-party/libscan/libscan/media/media.c
vendored
16
third-party/libscan/libscan/media/media.c
vendored
@@ -468,8 +468,7 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
if (scaled_frame == STORE_AS_IS) {
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
|
||||
frame_and_packet->packet->size);
|
||||
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
|
||||
} else {
|
||||
// Encode frame to jpeg
|
||||
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
|
||||
@@ -482,19 +481,17 @@ int decode_frame_and_save_thumbnail(scan_media_ctx_t *ctx, AVFormatContext *pFor
|
||||
|
||||
// Save thumbnail
|
||||
if (thumbnail_index == 0) {
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
|
||||
} else if (thumbnail_index > 1) {
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
// TO FIX: the 2nd rendered frame is always broken, just skip it until
|
||||
// I figure out a better fix.
|
||||
thumbnail_index -= 1;
|
||||
|
||||
char tn_key[sizeof(doc->doc_id) + sizeof(char) * 4];
|
||||
snprintf(tn_key, sizeof(tn_key), "%s%04d", doc->doc_id, thumbnail_index);
|
||||
ctx->store(doc->doc_id, thumbnail_index, jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
ctx->store((char *) tn_key, sizeof(tn_key), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
return_value = SAVE_THUMBNAIL_OK;
|
||||
} else {
|
||||
return_value = SAVE_THUMBNAIL_SKIPPED;
|
||||
}
|
||||
@@ -854,8 +851,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
|
||||
if (scaled_frame == STORE_AS_IS) {
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) frame_and_packet->packet->data,
|
||||
frame_and_packet->packet->size);
|
||||
ctx->store(doc->doc_id, 0, frame_and_packet->packet->data, frame_and_packet->packet->size);
|
||||
} else {
|
||||
// Encode frame to jpeg
|
||||
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
|
||||
@@ -868,7 +864,7 @@ int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, docu
|
||||
|
||||
// Save thumbnail
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), (char *) jpeg_packet.data, jpeg_packet.size);
|
||||
ctx->store(doc->doc_id, 0, jpeg_packet.data, jpeg_packet.size);
|
||||
|
||||
av_packet_unref(&jpeg_packet);
|
||||
avcodec_free_context(&jpeg_encoder);
|
||||
|
||||
2
third-party/libscan/libscan/ooxml/ooxml.c
vendored
2
third-party/libscan/libscan/ooxml/ooxml.c
vendored
@@ -191,7 +191,7 @@ void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, s
|
||||
archive_read_data(a, buf, entry_size);
|
||||
|
||||
APPEND_LONG_META(doc, MetaThumbnail, 1)
|
||||
ctx->store((char *) doc->doc_id, sizeof(doc->doc_id), buf, entry_size);
|
||||
ctx->store(doc->doc_id, 1, buf, entry_size);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
|
||||
10
third-party/libscan/libscan/scan.h
vendored
10
third-party/libscan/libscan/scan.h
vendored
@@ -6,6 +6,7 @@
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
#include <openssl/md5.h>
|
||||
#include <openssl/sha.h>
|
||||
@@ -16,7 +17,7 @@
|
||||
|
||||
#define UNUSED(x) __attribute__((__unused__)) x
|
||||
|
||||
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
|
||||
typedef void (*store_callback_t)(char *key, int num, void *buf, size_t buf_len);
|
||||
|
||||
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
|
||||
|
||||
@@ -111,8 +112,8 @@ typedef struct document {
|
||||
unsigned long size;
|
||||
unsigned int mime;
|
||||
int mtime;
|
||||
short base;
|
||||
short ext;
|
||||
int base;
|
||||
int ext;
|
||||
meta_line_t *meta_head;
|
||||
meta_line_t *meta_tail;
|
||||
char filepath[PATH_MAX * 2 + 1];
|
||||
@@ -144,7 +145,6 @@ typedef struct vfile {
|
||||
|
||||
int mtime;
|
||||
size_t st_size;
|
||||
unsigned int st_mode;
|
||||
|
||||
SHA_CTX sha1_ctx;
|
||||
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
|
||||
@@ -161,7 +161,7 @@ typedef struct vfile {
|
||||
logf_callback_t logf;
|
||||
} vfile_t;
|
||||
|
||||
typedef struct parse_job_t {
|
||||
typedef struct {
|
||||
int base;
|
||||
int ext;
|
||||
struct vfile vfile;
|
||||
|
||||
33
third-party/libscan/libscan/util.h
vendored
33
third-party/libscan/libscan/util.h
vendored
@@ -358,4 +358,37 @@ static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
static parse_job_t *create_parse_job(const char *filepath, int mtime, size_t st_size) {
|
||||
parse_job_t *job = (parse_job_t *) malloc(sizeof(parse_job_t));
|
||||
|
||||
job->parent[0] = '\0';
|
||||
|
||||
strcpy(job->filepath, filepath);
|
||||
strcpy(job->vfile.filepath, filepath);
|
||||
job->vfile.st_size = st_size;
|
||||
job->vfile.mtime = mtime;
|
||||
|
||||
const char *slash = strrchr(filepath, '/');
|
||||
if (slash == NULL) {
|
||||
job->base = 0;
|
||||
} else {
|
||||
job->base = (int) (slash - filepath + 1);
|
||||
}
|
||||
|
||||
const char *dot = strrchr(filepath + job->base, '.');
|
||||
if (dot == NULL) {
|
||||
job->ext = (int) strlen(filepath);
|
||||
} else {
|
||||
job->ext = (int) (dot - filepath + 1);
|
||||
}
|
||||
|
||||
job->vfile.fd = -1;
|
||||
job->vfile.is_fs_file = TRUE;
|
||||
job->vfile.has_checksum = FALSE;
|
||||
job->vfile.rewind_buffer_size = 0;
|
||||
job->vfile.rewind_buffer = NULL;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
1
third-party/libscan/test/test_util.cpp
vendored
1
third-party/libscan/test/test_util.cpp
vendored
@@ -55,7 +55,6 @@ void load_file(const char *filepath, vfile_t *f) {
|
||||
|
||||
f->mtime = (int)info.st_mtim.tv_sec;
|
||||
f->st_size = info.st_size;
|
||||
f->st_mode = info.st_mode;
|
||||
|
||||
f->fd = open(filepath, O_RDONLY);
|
||||
|
||||
|
||||
2
third-party/libscan/test/test_util.h
vendored
2
third-party/libscan/test/test_util.h
vendored
@@ -21,7 +21,7 @@ static void noop_log(const char *filepath, int level, char *str) {
|
||||
|
||||
static size_t store_size = 0;
|
||||
|
||||
static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
|
||||
static void counter_store(char* key, int num, void *value, size_t value_len) {
|
||||
store_size += value_len;
|
||||
// char id[37];
|
||||
// char tmp[PATH_MAX];
|
||||
|
||||
2
third-party/libscan/third-party/antiword
vendored
2
third-party/libscan/third-party/antiword
vendored
Submodule third-party/libscan/third-party/antiword updated: ddb042143e...badfdac845
Reference in New Issue
Block a user