mirror of
https://github.com/simon987/sist2.git
synced 2025-12-12 15:08:53 +00:00
use sqlite to save index, major thread pool refactor
This commit is contained in:
42
src/parsing/fs_util.h
Normal file
42
src/parsing/fs_util.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#ifndef SIST2_FS_UTIL_H
|
||||
#define SIST2_FS_UTIL_H
|
||||
|
||||
#include "src/sist.h"
|
||||
|
||||
/*
 * Invoke the close callback of vfile `f` if one is set.
 *
 * `f` is evaluated several times, so pass a plain lvalue without side effects.
 * The expansion is a complete `if` statement followed by an empty statement,
 * which is why call sites may omit the trailing semicolon; for the same reason
 * it must not be used as the unbraced body of an if/else.
 */
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
|
||||
|
||||
static int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
|
||||
if (f->fd == -1) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
|
||||
f->fd = open(f->filepath, O_RDONLY);
|
||||
if (f->fd == -1) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int ret = (int) read(f->fd, buf, size);
|
||||
|
||||
if (ret != 0 && f->calculate_checksum) {
|
||||
f->has_checksum = TRUE;
|
||||
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void fs_close(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
|
||||
close(f->fd);
|
||||
}
|
||||
}
|
||||
|
||||
static void fs_reset(struct vfile *f) {
|
||||
if (f->fd != -1) {
|
||||
lseek(f->fd, 0, SEEK_SET);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
32
src/parsing/magic_util.c
Normal file
32
src/parsing/magic_util.c
Normal file
@@ -0,0 +1,32 @@
|
||||
#include "magic_util.h"
|
||||
#include "src/log.h"
|
||||
#include "mime.h"
|
||||
#include <magic.h>
|
||||
#include "src/magic_generated.c"
|
||||
|
||||
|
||||
char *magic_buffer_embedded(void *buffer, size_t buffer_size) {
|
||||
|
||||
magic_t magic = magic_open(MAGIC_MIME_TYPE);
|
||||
|
||||
const char *magic_buffers[1] = {magic_database_buffer,};
|
||||
size_t sizes[1] = {sizeof(magic_database_buffer),};
|
||||
|
||||
// TODO: check if we can reuse the magic instance
|
||||
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
|
||||
|
||||
if (load_ret != 0) {
|
||||
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret);
|
||||
}
|
||||
|
||||
const char *magic_mime_str = magic_buffer(magic, buffer, buffer_size);
|
||||
char *return_value = NULL;
|
||||
|
||||
if (magic_mime_str != NULL) {
|
||||
return_value = malloc(strlen(magic_mime_str) + 1);
|
||||
strcpy(return_value, magic_mime_str);
|
||||
}
|
||||
|
||||
magic_close(magic);
|
||||
return return_value;
|
||||
}
|
||||
8
src/parsing/magic_util.h
Normal file
8
src/parsing/magic_util.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef SIST2_MAGIC_UTIL_H
|
||||
#define SIST2_MAGIC_UTIL_H
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/**
 * Guess the MIME type of `buffer` (`buffer_size` bytes) with libmagic.
 *
 * @return malloc()ed MIME string owned by the caller (release with free()),
 *         or NULL if libmagic returned no result
 */
char *magic_buffer_embedded(void *buffer, size_t buffer_size);
|
||||
|
||||
#endif //SIST2_MAGIC_UTIL_H
|
||||
@@ -1,22 +1,30 @@
|
||||
#include "mime.h"
|
||||
#include <zlib.h>
|
||||
|
||||
/**
 * Look up a MIME id from a file extension.
 *
 * The extension is lower-cased into a fixed 16-byte buffer (anything past the
 * first 15 characters is ignored), its CRC32 is computed and the result is
 * resolved through mime_extension_lookup().
 *
 * @param ext NUL-terminated extension
 * @return whatever mime_extension_lookup() yields for the extension's CRC32
 */
unsigned int mime_get_mime_by_ext(const char *ext) {
    unsigned char lower[16];
    size_t len = 0;

    while (ext[len] != '\0' && len + 1 < sizeof(lower)) {
        // tolower() is only defined for unsigned char values and EOF, so
        // convert before the call to stay safe with non-ASCII bytes.
        lower[len] = (unsigned char) tolower((unsigned char) ext[len]);
        len++;
    }
    lower[len] = '\0';

    unsigned long crc = crc32(0, lower, (unsigned int) len);

    return mime_extension_lookup(crc);
}
|
||||
|
||||
/**
 * Look up a MIME id from its textual name.
 *
 * Leading spaces and '[' characters are skipped; the CRC32 of the remaining
 * text is resolved through mime_name_lookup().
 *
 * @param str NUL-terminated MIME name (must not be NULL)
 * @return whatever mime_name_lookup() yields for the name's CRC32
 */
unsigned int mime_get_mime_by_string(const char *str) {

    const char *ptr = str;
    while (*ptr == ' ' || *ptr == '[') {
        ptr++;
    }

    unsigned long crc = crc32(0, (unsigned char *) ptr, strlen(ptr));

    return mime_name_lookup(crc);
}
|
||||
|
||||
@@ -51,14 +51,14 @@ enum major_mime {
|
||||
|
||||
enum mime;
|
||||
|
||||
GHashTable *mime_get_mime_table();
|
||||
unsigned int mime_name_lookup(unsigned long mime_crc32);
|
||||
|
||||
GHashTable *mime_get_ext_table();
|
||||
unsigned int mime_extension_lookup(unsigned long extension_crc32);
|
||||
|
||||
char *mime_get_mime_text(unsigned int);
|
||||
const char *mime_get_mime_text(unsigned int);
|
||||
|
||||
unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext);
|
||||
unsigned int mime_get_mime_by_ext(const char *ext);
|
||||
|
||||
unsigned int mime_get_mime_by_string(GHashTable *mime_table, const char * str);
|
||||
unsigned int mime_get_mime_by_string(const char *str);
|
||||
|
||||
#endif
|
||||
|
||||
2730
src/parsing/mime_generated.c
vendored
2730
src/parsing/mime_generated.c
vendored
File diff suppressed because it is too large
Load Diff
@@ -5,235 +5,242 @@
|
||||
#include "mime.h"
|
||||
#include "src/io/serialize.h"
|
||||
#include "src/parsing/sidecar.h"
|
||||
#include "src/magic_generated.c"
|
||||
|
||||
#include <magic.h>
|
||||
#include "src/parsing/fs_util.h"
|
||||
#include "src/parsing/magic_util.h"
|
||||
#include <pthread.h>
|
||||
|
||||
|
||||
#define MIN_VIDEO_SIZE (1024 * 64)
|
||||
#define MIN_IMAGE_SIZE (512)
|
||||
|
||||
int fs_read(struct vfile *f, void *buf, size_t size) {
|
||||
#define MAGIC_BUF_SIZE (4096 * 6)
|
||||
|
||||
if (f->fd == -1) {
|
||||
SHA1_Init(&f->sha1_ctx);
|
||||
/* Parser category selected for a document by get_file_type(). */
typedef enum {
    FILETYPE_DONT_PARSE,    /* no parser is run */
    FILETYPE_RAW,           /* parse_raw() */
    FILETYPE_MEDIA,         /* parse_media() */
    FILETYPE_EBOOK,         /* parse_ebook() */
    FILETYPE_MARKUP,        /* parse_markup() */
    FILETYPE_TEXT,          /* parse_text() */
    FILETYPE_FONT,          /* parse_font() */
    FILETYPE_ARCHIVE,       /* parse_archive() */
    FILETYPE_OOXML,         /* parse_ooxml() */
    FILETYPE_COMIC,         /* parse_comic() */
    FILETYPE_MOBI,          /* parse_mobi() */
    FILETYPE_SIST2_SIDECAR, /* parse_sidecar() */
    FILETYPE_MSDOC,         /* parse_msdoc() */
    FILETYPE_JSON,          /* parse_json() */
    FILETYPE_NDJSON,        /* parse_ndjson() */
} file_type_t;
|
||||
|
||||
f->fd = open(f->filepath, O_RDONLY);
|
||||
if (f->fd == -1) {
|
||||
return -1;
|
||||
file_type_t get_file_type(unsigned int mime, size_t size, const char *filepath) {
|
||||
|
||||
int major_mime = MAJOR_MIME(mime);
|
||||
|
||||
if (!(SHOULD_PARSE(mime))) {
|
||||
return FILETYPE_DONT_PARSE;
|
||||
} else if (IS_RAW(mime)) {
|
||||
return FILETYPE_RAW;
|
||||
} else if ((major_mime == MimeVideo && size >= MIN_VIDEO_SIZE) ||
|
||||
(major_mime == MimeImage && size >= MIN_IMAGE_SIZE) || major_mime == MimeAudio) {
|
||||
return FILETYPE_MEDIA;
|
||||
} else if (IS_PDF(mime)) {
|
||||
return FILETYPE_EBOOK;
|
||||
} else if (major_mime == MimeText && ScanCtx.text_ctx.content_size > 0) {
|
||||
if (IS_MARKUP(mime)) {
|
||||
return FILETYPE_MARKUP;
|
||||
} else {
|
||||
return FILETYPE_TEXT;
|
||||
}
|
||||
|
||||
} else if (IS_FONT(mime)) {
|
||||
return FILETYPE_FONT;
|
||||
} else if (
|
||||
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
|
||||
IS_ARC(mime) ||
|
||||
(IS_ARC_FILTER(mime) && should_parse_filtered_file(filepath))
|
||||
)) {
|
||||
return FILETYPE_ARCHIVE;
|
||||
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(mime)) {
|
||||
return FILETYPE_OOXML;
|
||||
} else if (is_cbr(&ScanCtx.comic_ctx, mime) || is_cbz(&ScanCtx.comic_ctx, mime)) {
|
||||
return FILETYPE_COMIC;
|
||||
} else if (IS_MOBI(mime)) {
|
||||
return FILETYPE_MOBI;
|
||||
} else if (mime == MIME_SIST2_SIDECAR) {
|
||||
return FILETYPE_SIST2_SIDECAR;
|
||||
} else if (is_msdoc(&ScanCtx.msdoc_ctx, mime)) {
|
||||
return FILETYPE_MSDOC;
|
||||
} else if (is_json(&ScanCtx.json_ctx, mime)) {
|
||||
return FILETYPE_JSON;
|
||||
} else if (is_ndjson(&ScanCtx.json_ctx, mime)) {
|
||||
return FILETYPE_NDJSON;
|
||||
}
|
||||
}
|
||||
|
||||
#define GET_MIME_ERROR_FATAL (-1)
|
||||
|
||||
/**
 * Determine the MIME id of the file described by a parse job.
 *
 * Resolution order:
 *   1. zero-sized files are MIME_EMPTY;
 *   2. the file extension, via mime_get_mime_by_ext();
 *   3. unless ScanCtx.fast is set, libmagic on the first MAGIC_BUF_SIZE bytes.
 *
 * After a successful libmagic read the file is rewound through the vfile's
 * reset callback (when one is set) so that parsers start at offset 0.
 *
 * @param job parse job; job->vfile is read from (and rewound) in step 3
 * @return the MIME id, 0 when the type could not be determined, or
 *         GET_MIME_ERROR_FATAL when reading the file failed
 */
int get_mime(parse_job_t *job) {

    char *extension = job->filepath + job->ext;

    int mime = 0;

    if (job->vfile.st_size == 0) {
        return MIME_EMPTY;
    }

    if (*extension != '\0' && (job->ext - job->base != 1)) {
        mime = (int) mime_get_mime_by_ext(extension);

        if (mime != 0) {
            return mime;
        }
    }

    // NOTE(review): looks like leftover debugging output - confirm it is still wanted
    if (strlen(extension) == 0 && strlen(job->filepath + job->base) == 40) {
        fprintf(stderr, "GIT? %s", job->filepath);
    }

    if (ScanCtx.fast) {
        return 0;
    }

    // Get mime type with libmagic
    if (job->vfile.read_rewindable == NULL) {
        LOG_WARNING(job->filepath,
                    "File does not support rewindable reads, cannot guess Media type");
        return 0;
    }

    // Plain byte buffer: an array of pointers would reserve sizeof(char *)
    // times more stack than needed.
    char buf[MAGIC_BUF_SIZE];
    int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
    if (bytes_read < 0) {
        if (job->vfile.is_fs_file) {
            LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno));
        } else {
            LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc));
        }

        return GET_MIME_ERROR_FATAL;
    }

    char *magic_mime_str = magic_buffer_embedded(buf, bytes_read);

    if (magic_mime_str != NULL) {
        mime = (int) mime_get_mime_by_string(magic_mime_str);

        // Log before releasing the string; it must not be touched after free()
        if (mime == 0) {
            LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
        }

        free(magic_mime_str);
    }

    // Rewind on every path that consumed data, including the unknown-mime one
    if (job->vfile.reset != NULL) {
        job->vfile.reset(&job->vfile);
    }

    return mime;
}
|
||||
|
||||
/**
 * Record `job` as the file currently processed by the calling thread.
 *
 * The entry is stored in ScanCtx.dbg_current_files, keyed by the value of
 * pthread_self(), while holding ScanCtx.dbg_current_files_mu.
 */
void set_dbg_current_file(parse_job_t *job) {
    unsigned long long thread_id = (unsigned long long) pthread_self();
    void *thread_key = GINT_TO_POINTER(thread_id);

    pthread_mutex_lock(&ScanCtx.dbg_current_files_mu);
    g_hash_table_replace(ScanCtx.dbg_current_files, thread_key, job);
    pthread_mutex_unlock(&ScanCtx.dbg_current_files_mu);
}
|
||||
void parse(parse_job_t *job) {
|
||||
|
||||
/**
 * Run parse() synchronously on a copy of `job`.
 *
 * The job is copied behind a temporary tpool_work_arg_shm_t header, which is
 * the argument type parse() expects, and the wrapper is freed once parse()
 * returns.
 *
 * @param job job to parse; the caller keeps ownership and it is not modified
 */
void parse_job(parse_job_t *job) {
    size_t arg_bytes = sizeof(tpool_work_arg_shm_t) + sizeof(*job);

    tpool_work_arg_shm_t *arg = malloc(arg_bytes);
    if (arg == NULL) {
        LOG_FATALF("parse.c", "Could not allocate %zu bytes for parse job", arg_bytes);
        return;
    }

    memcpy(arg->arg, job, sizeof(*job));
    // NOTE(review): -1 presumably marks an argument that did not come from the
    // thread pool - confirm against tpool.h
    arg->arg_size = -1;

    parse(arg);

    free(arg);
}
|
||||
|
||||
void parse(tpool_work_arg_shm_t *arg) {
|
||||
|
||||
parse_job_t *job = (void*)arg->arg;
|
||||
if (job->vfile.is_fs_file) {
|
||||
job->vfile.read = fs_read;
|
||||
job->vfile.read_rewindable = fs_read;
|
||||
job->vfile.reset = fs_reset;
|
||||
job->vfile.close = fs_close;
|
||||
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
|
||||
}
|
||||
|
||||
document_t *doc = malloc(sizeof(document_t));
|
||||
|
||||
set_dbg_current_file(job);
|
||||
|
||||
strcpy(doc->filepath, job->filepath);
|
||||
doc->ext = (short) job->ext;
|
||||
doc->base = (short) job->base;
|
||||
|
||||
char *rel_path = doc->filepath + ScanCtx.index.desc.root_len;
|
||||
generate_doc_id(rel_path, doc->doc_id);
|
||||
|
||||
doc->ext = job->ext;
|
||||
doc->base = job->base;
|
||||
doc->meta_head = NULL;
|
||||
doc->meta_tail = NULL;
|
||||
doc->mime = 0;
|
||||
doc->size = job->vfile.st_size;
|
||||
doc->mtime = (int) job->vfile.mtime;
|
||||
doc->mime = get_mime(job);
|
||||
generate_doc_id(doc->filepath + ScanCtx.index.desc.root_len, doc->doc_id);
|
||||
|
||||
int inc_ts = incremental_get(ScanCtx.original_table, doc->doc_id);
|
||||
if (inc_ts != 0 && inc_ts == job->vfile.mtime) {
|
||||
pthread_mutex_lock(&ScanCtx.copy_table_mu);
|
||||
incremental_mark_file(ScanCtx.copy_table, doc->doc_id);
|
||||
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
|
||||
if (doc->mime == GET_MIME_ERROR_FATAL) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_failed_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
return;
|
||||
}
|
||||
|
||||
if (database_mark_document(ProcData.index_db, doc->doc_id, doc->mtime)) {
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_skipped_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (ScanCtx.new_table != NULL) {
|
||||
pthread_mutex_lock(&ScanCtx.copy_table_mu);
|
||||
incremental_mark_file(ScanCtx.new_table, doc->doc_id);
|
||||
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
|
||||
}
|
||||
|
||||
char *buf[MAGIC_BUF_SIZE];
|
||||
|
||||
if (LogCtx.very_verbose) {
|
||||
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id)
|
||||
LOG_DEBUGF(job->filepath, "Starting parse job {%s}", doc->doc_id);
|
||||
}
|
||||
|
||||
if (job->ext > 4096) {
|
||||
fprintf(stderr, "Ext is %d, filename is %s\n", job->ext, job->filepath);
|
||||
}
|
||||
|
||||
if (job->vfile.st_size == 0) {
|
||||
doc->mime = MIME_EMPTY;
|
||||
} else if (*(job->filepath + job->ext) != '\0' && (job->ext - job->base != 1)) {
|
||||
doc->mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
|
||||
}
|
||||
|
||||
if (doc->mime == 0 && !ScanCtx.fast) {
|
||||
|
||||
// Get mime type with libmagic
|
||||
if (job->vfile.read_rewindable == NULL) {
|
||||
LOG_WARNING(job->filepath,
|
||||
"File does not support rewindable reads, cannot guess Media type");
|
||||
goto abort;
|
||||
}
|
||||
|
||||
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
|
||||
if (bytes_read < 0) {
|
||||
|
||||
if (job->vfile.is_fs_file) {
|
||||
LOG_ERRORF(job->filepath, "read(): [%d] %s", errno, strerror(errno))
|
||||
} else {
|
||||
LOG_ERRORF(job->filepath, "(virtual) read(): [%d] %s", bytes_read, archive_error_string(job->vfile.arc))
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
|
||||
ScanCtx.dbg_failed_files_count += 1;
|
||||
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
|
||||
|
||||
switch (get_file_type(doc->mime, doc->size, doc->filepath)) {
|
||||
case FILETYPE_RAW:
|
||||
parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_MEDIA:
|
||||
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
|
||||
break;
|
||||
case FILETYPE_EBOOK:
|
||||
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
|
||||
break;
|
||||
case FILETYPE_MARKUP:
|
||||
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_TEXT:
|
||||
parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_FONT:
|
||||
parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_ARCHIVE:
|
||||
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
|
||||
break;
|
||||
case FILETYPE_OOXML:
|
||||
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_COMIC:
|
||||
parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_MOBI:
|
||||
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_SIST2_SIDECAR:
|
||||
parse_sidecar(&job->vfile, doc);
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
magic_t magic = magic_open(MAGIC_MIME_TYPE);
|
||||
|
||||
const char *magic_buffers[1] = {magic_database_buffer,};
|
||||
size_t sizes[1] = {sizeof(magic_database_buffer),};
|
||||
|
||||
int load_ret = magic_load_buffers(magic, (void **) &magic_buffers, sizes, 1);
|
||||
|
||||
if (load_ret != 0) {
|
||||
LOG_FATALF("parse.c", "Could not load libmagic database: (%d)", load_ret)
|
||||
}
|
||||
|
||||
const char *magic_mime_str = magic_buffer(magic, buf, bytes_read);
|
||||
if (magic_mime_str != NULL) {
|
||||
doc->mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str);
|
||||
|
||||
LOG_DEBUGF(job->filepath, "libmagic: %s", magic_mime_str);
|
||||
|
||||
if (doc->mime == 0) {
|
||||
LOG_WARNINGF(job->filepath, "Couldn't find mime %s", magic_mime_str);
|
||||
}
|
||||
}
|
||||
|
||||
if (job->vfile.reset != NULL) {
|
||||
job->vfile.reset(&job->vfile);
|
||||
}
|
||||
|
||||
magic_close(magic);
|
||||
case FILETYPE_MSDOC:
|
||||
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_JSON:
|
||||
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_NDJSON:
|
||||
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
break;
|
||||
case FILETYPE_DONT_PARSE:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
int mmime = MAJOR_MIME(doc->mime);
|
||||
|
||||
if (!(SHOULD_PARSE(doc->mime))) {
|
||||
|
||||
} else if (IS_RAW(doc->mime)) {
|
||||
parse_raw(&ScanCtx.raw_ctx, &job->vfile, doc);
|
||||
} else if ((mmime == MimeVideo && doc->size >= MIN_VIDEO_SIZE) ||
|
||||
(mmime == MimeImage && doc->size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
|
||||
|
||||
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
|
||||
|
||||
} else if (IS_PDF(doc->mime)) {
|
||||
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
|
||||
|
||||
} else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) {
|
||||
if (IS_MARKUP(doc->mime)) {
|
||||
parse_markup(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
} else {
|
||||
parse_text(&ScanCtx.text_ctx, &job->vfile, doc);
|
||||
}
|
||||
|
||||
} else if (IS_FONT(doc->mime)) {
|
||||
parse_font(&ScanCtx.font_ctx, &job->vfile, doc);
|
||||
|
||||
} else if (
|
||||
ScanCtx.arc_ctx.mode != ARC_MODE_SKIP && (
|
||||
IS_ARC(doc->mime) ||
|
||||
(IS_ARC_FILTER(doc->mime) && should_parse_filtered_file(doc->filepath, doc->ext))
|
||||
)) {
|
||||
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
|
||||
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(doc->mime)) {
|
||||
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
|
||||
} else if (is_cbr(&ScanCtx.comic_ctx, doc->mime) || is_cbz(&ScanCtx.comic_ctx, doc->mime)) {
|
||||
parse_comic(&ScanCtx.comic_ctx, &job->vfile, doc);
|
||||
} else if (IS_MOBI(doc->mime)) {
|
||||
parse_mobi(&ScanCtx.mobi_ctx, &job->vfile, doc);
|
||||
} else if (doc->mime == MIME_SIST2_SIDECAR) {
|
||||
parse_sidecar(&job->vfile, doc);
|
||||
CLOSE_FILE(job->vfile)
|
||||
free(doc);
|
||||
return;
|
||||
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
|
||||
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
|
||||
} else if (is_json(&ScanCtx.json_ctx, doc->mime)) {
|
||||
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
} else if (is_ndjson(&ScanCtx.json_ctx, doc->mime)) {
|
||||
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
|
||||
}
|
||||
|
||||
abort:
|
||||
|
||||
//Parent meta
|
||||
if (job->parent[0] != '\0') {
|
||||
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + SIST_INDEX_ID_LEN);
|
||||
@@ -247,12 +254,8 @@ void parse(tpool_work_arg_shm_t *arg) {
|
||||
if (job->vfile.has_checksum) {
|
||||
char sha1_digest_str[SHA1_STR_LENGTH];
|
||||
buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str);
|
||||
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str);
|
||||
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str)
|
||||
}
|
||||
|
||||
write_document(doc);
|
||||
}
|
||||
|
||||
/* Parser teardown hook; intentionally does nothing. */
void cleanup_parse() {
    /* nothing to release */
}
|
||||
|
||||
@@ -4,15 +4,7 @@
|
||||
#include "../sist.h"
|
||||
#include "src/tpool.h"
|
||||
|
||||
#define MAGIC_BUF_SIZE (4096 * 6)
|
||||
|
||||
int fs_read(struct vfile *f, void *buf, size_t size);
|
||||
void fs_close(struct vfile *f);
|
||||
void fs_reset(struct vfile *f);
|
||||
|
||||
void parse_job(parse_job_t *job);
|
||||
void parse(tpool_work_arg_shm_t *arg);
|
||||
|
||||
void cleanup_parse();
|
||||
void parse(parse_job_t *arg);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -4,12 +4,12 @@
|
||||
|
||||
void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath)
|
||||
LOG_DEBUGF("sidecar.c", "Parsing sidecar file %s", vfile->filepath);
|
||||
|
||||
size_t size;
|
||||
char *buf = read_all(vfile, &size);
|
||||
if (buf == NULL) {
|
||||
LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath)
|
||||
LOG_ERRORF("sidecar.c", "Read error for %s", vfile->filepath);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
cJSON *json = cJSON_Parse(buf);
|
||||
if (json == NULL) {
|
||||
LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath)
|
||||
LOG_ERRORF("sidecar.c", "Could not parse JSON sidecar %s", vfile->filepath);
|
||||
return;
|
||||
}
|
||||
char *json_str = cJSON_PrintUnformatted(json);
|
||||
@@ -32,8 +32,7 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
|
||||
|
||||
generate_doc_id(rel_path, assoc_doc_id);
|
||||
|
||||
store_write(ScanCtx.index.meta_store, assoc_doc_id, sizeof(assoc_doc_id), json_str,
|
||||
strlen(json_str) + 1);
|
||||
database_write_document_sidecar(ProcData.index_db, assoc_doc_id, json_str);
|
||||
|
||||
cJSON_Delete(json);
|
||||
free(json_str);
|
||||
|
||||
Reference in New Issue
Block a user