Archive file support

This commit is contained in:
simon 2019-12-13 10:53:51 -05:00
parent 9778acda77
commit be23201210
40 changed files with 879 additions and 434 deletions

View File

@ -41,7 +41,7 @@ if (WITH_SIST2)
# utf8.h
utf8.h/utf8.h
)
src/parsing/arc.c src/parsing/arc.h)
endif ()
if (WITH_SIST2_SCAN)
@ -74,7 +74,7 @@ if (WITH_SIST2_SCAN)
# utf8.h
utf8.h/utf8.h
)
src/parsing/arc.c src/parsing/arc.h)
endif ()
find_package(PkgConfig REQUIRED)
@ -160,6 +160,7 @@ if (WITH_SIST2)
${PROJECT_SOURCE_DIR}/lib/libharfbuzz.a
${PROJECT_SOURCE_DIR}/lib/libopenjp2.a
freetype
archive
)
endif ()
@ -228,6 +229,11 @@ if (WITH_SIST2_SCAN)
${PROJECT_SOURCE_DIR}/lib/libharfbuzz.a
${PROJECT_SOURCE_DIR}/lib/libopenjp2.a
freetype
${PROJECT_SOURCE_DIR}/lib/libarchive.a
${PROJECT_SOURCE_DIR}/lib/liblz4.a
${PROJECT_SOURCE_DIR}/lib/liblzma.a
${PROJECT_SOURCE_DIR}/lib/libzstd.a
)
endif ()

View File

@ -15,9 +15,11 @@ sist2 (Simple incremental search tool)
* Generates thumbnails\*
* Incremental scanning
* Automatic tagging from file attributes via [user scripts](scripting/README.md)
* Recursive scan inside archive files \*\*
\* See [format support](#format-support)
\* See [format support](#format-support)
\*\* See [Archive files](#archive-files)
## Getting Started
@ -33,8 +35,6 @@ sist2 (Simple incremental search tool)
## Example usage
![demo](demo.gif)
See help page `sist2 --help` for more details.
**Scan a directory**
@ -91,10 +91,25 @@ pdf,xps,cbz,fb2,epub | MuPDF | yes | yes, `png` | title |
`image/*` | ffmpeg | - | yes, `jpeg` | `EXIF:Artist`, `EXIF:ImageDescription` |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | *planned* | - | no |
docx, xlsx, pptx | | *planned* | no | *planned* |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | | yes | no | *planned* |
\* *See [Archive files](#archive-files)*
### Archive files
**sist2** will scan files stored inside archive files (zip, tar, 7z...) as if
they were directly in the file system. Recursive scanning (archives inside
archives) is also supported.
**Limitations**:
* Parsing media files with formats that require
*seek* (e.g. `.gif`, `.mp4` w/ fragmented metadata etc.) is not supported.
* Archive files are scanned sequentially, by a single thread. On systems where
**sist2** is not I/O bound, scans might be faster when larger archives are split
into smaller parts.
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
## Build from source

@ -1 +0,0 @@
Subproject commit 288acf97a15d558f96c24c89f578b724d6e06b0c

@ -1 +0,0 @@
Subproject commit 53c21c2d6bebba887be9a30de204875fb41b1169

@ -1 +0,0 @@
Subproject commit 878e3588a3349c2660b0f9aa6d94a994034d7c10

@ -1 +0,0 @@
Subproject commit 1249b5cd02c3b6fb9b917d16c76bc76c862932b6

@ -1 +0,0 @@
Subproject commit 355cedaefe68358ad533ffb6a59bbb4e6444267a

@ -1 +0,0 @@
Subproject commit d8d4cc9290982e1fdd254377ff62d8175f9c6059

@ -1 +0,0 @@
Subproject commit 5875a6b44618fb7dfd5cd6d742533eaee2014060

View File

@ -410,4 +410,9 @@ text/PGP,
audio/x-hx-aac-adts,
application/x-chrome-extension,
image/heic, heic
image/x-gem,
image/x-gem,
application/x-lzma, lzma
application/warc, warc
application/x-lz4, lz4
application/x-lzip, lz
application/x-lzop, lzo

1 application/arj arj
410 audio/x-hx-aac-adts
411 application/x-chrome-extension
412 image/heic heic
413 image/x-gem
414 application/x-lzma lzma
415 application/warc warc
416 application/x-lz4 lz4
417 application/x-lzip lz
418 application/x-lzop lzo

View File

@ -67,5 +67,39 @@ make -j 4
cd ..
mv libmagic/src/.libs/libmagic.a .
# libarchive
git clone https://github.com/libarchive/libarchive
cd libarchive/build
./autogen.sh
cd ..
./configure --without-nettle --without-expat --without-xml2 --without-openssl
make -j 4
cd ..
mv libarchive/.libs/libarchive.a .
# lz4
git clone https://github.com/lz4/lz4
cd lz4
make -j 4
cd ..
mv lz4/lib/liblz4.a .
# lzma
wget https://newcontinuum.dl.sourceforge.net/project/lzmautils/xz-5.2.3.tar.gz
tar -xzf xz-5.2.3.tar.gz
rm xz-5.2.3.tar.gz
cd xz-5.2.3
./autogen.sh
./configure
make -j 4
cd ..
mv xz-5.2.3/src/liblzma/.libs/liblzma.a .
# zstd
git clone https://github.com/facebook/zstd
cd zstd
make -j 4
cd ..
mv zstd/lib/libzstd.a .
cd ..

View File

@ -34,6 +34,28 @@ font = (
"font/woff2"
)
# Archive "formats": mime_id() ORs 0x10000000 into the id of every MIME type
# listed here.
# WARC is registered in the MIME list as "application/warc", so that spelling
# must be present for the flag to be applied; "application/x-warc" is kept for
# backward compatibility.
archive = (
    "application/x-tar",
    "application/zip",
    "application/x-rar",
    "application/x-arc",
    "application/warc",
    "application/x-warc",
    "application/x-7z-compressed",
)
# Archive "filters": MIME types of single-stream compression wrappers.
# mime_id() ORs 0x08000000 into the id of every MIME type listed here
# (checked only after the "archive" tuple, so a type in both gets the
# archive flag, not this one).
arc_filter = (
"application/gzip",
"application/x-bzip2",
"application/x-xz",
"application/x-zstd",
"application/x-lzma",
"application/x-lz4",
"application/x-lzip",
"application/x-lzop",
)
cnt = 1
@ -48,6 +70,10 @@ def mime_id(mime):
mime_id += " | 0x40000000"
elif mime in font:
mime_id += " | 0x20000000"
elif mime in archive:
mime_id += " | 0x10000000"
elif mime in arc_filter:
mime_id += " | 0x08000000"
elif mime == "application/x-empty":
return "1"
return mime_id

View File

@ -1,7 +1,7 @@
#include "cli.h"
#define DEFAULT_OUTPUT "index.sist2/"
#define DEFAULT_CONTENT_SIZE 4096
#define DEFAULT_CONTENT_SIZE 32768
#define DEFAULT_QUALITY 5
#define DEFAULT_SIZE 500
#define DEFAULT_REWRITE_URL ""
@ -35,6 +35,7 @@ void scan_args_destroy(scan_args_t *args) {
}
#ifndef SIST_SCAN_ONLY
void index_args_destroy(index_args_t *args) {
//todo
free(args);
@ -44,6 +45,7 @@ void web_args_destroy(web_args_t *args) {
//todo
free(args);
}
#endif
int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
@ -119,10 +121,24 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->rewrite_url == NULL) {
args->rewrite_url = DEFAULT_REWRITE_URL;
}
if (args->archive == NULL || strcmp(args->archive, "recurse") == 0) {
args->archive_mode = ARC_MODE_RECURSE;
} else if (strcmp(args->archive, "list") == 0) {
args->archive_mode = ARC_MODE_LIST;
} else if (strcmp(args->archive, "shallow") == 0) {
args->archive_mode = ARC_MODE_SHALLOW;
} else if (strcmp(args->archive, "skip") == 0) {
args->archive_mode = ARC_MODE_SKIP;
} else {
fprintf(stderr, "Archive mode must be one of (skip, list, shallow, recurse), got '%s'", args->archive);
return 1;
}
return 0;
}
#ifndef SIST_SCAN_ONLY
int index_args_validate(index_args_t *args, int argc, const char **argv) {
if (argc < 2) {
@ -196,7 +212,7 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
}
if (args->credentials != NULL) {
args->b64credentials = onion_base64_encode(args->credentials, (int)strlen(args->credentials));
args->b64credentials = onion_base64_encode(args->credentials, (int) strlen(args->credentials));
//Remove trailing newline
*(args->b64credentials + strlen(args->b64credentials) - 1) = '\0';
}
@ -223,5 +239,6 @@ web_args_t *web_args_create() {
web_args_t *args = calloc(sizeof(web_args_t), 1);
return args;
}
#endif

View File

@ -14,6 +14,8 @@ typedef struct scan_args {
char *name;
int depth;
char *path;
char *archive;
archive_mode_t archive_mode;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@ -16,6 +16,7 @@ struct {
int content_size;
float tn_qscale;
int depth;
archive_mode_t archive_mode;
size_t stat_tn_size;
size_t stat_index_size;

View File

@ -111,6 +111,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "title";
case MetaFontName:
return "font_name";
case MetaParent:
return "parent";
default:
return NULL;
}
@ -247,6 +249,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
case MetaAlbumArtist:
case MetaGenre:
case MetaFontName:
case MetaParent:
case MetaTitle: {
buf.cur = 0;
while ((c = getc(file)) != 0) {

View File

@ -1,7 +1,8 @@
#include "walk.h"
#include "src/ctx.h"
parse_job_t *create_parse_job(const char *filepath, const struct stat *info, int base) {
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
parse_job_t *job = malloc(sizeof(parse_job_t) + len);
@ -14,14 +15,22 @@ parse_job_t *create_parse_job(const char *filepath, const struct stat *info, int
job->ext = len;
}
memcpy(&(job->info), info, sizeof(struct stat));
job->info = *info;
memset(job->parent, 0, 16);
job->vfile.filepath = job->filepath;
job->vfile.read = fs_read;
job->vfile.close = fs_close;
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
return job;
}
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (ftw->level <= ScanCtx.depth && typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_parse_job(filepath, info, ftw->base);
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}

View File

@ -10,7 +10,7 @@
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "1.1.9";
static const char *const Version = "1.1.10";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
@ -51,7 +51,7 @@ void scan_print_header() {
}
if (ScanCtx.content_size > 0) {
printf("content_size\t%d B\n", ScanCtx.content_size);
printf("content_size\t\t%d B\n", ScanCtx.content_size);
} else {
printf("content_size\t\t\tdisabled\n");
}
@ -66,6 +66,7 @@ void sist2_scan(scan_args_t *args) {
ScanCtx.content_size = args->content_size;
ScanCtx.threads = args->threads;
ScanCtx.depth = args->depth;
ScanCtx.archive_mode = args->archive_mode;
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root));
@ -242,7 +243,7 @@ int main(int argc, const char *argv[]) {
OPT_INTEGER(0, "size", &scan_args->size,
"Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500"),
OPT_INTEGER(0, "content-size", &scan_args->content_size,
"Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=4096"),
"Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768"),
OPT_STRING(0, "incremental", &scan_args->incremental,
"Reuse an existing index and only scan modified files."),
OPT_STRING('o', "output", &scan_args->output, "Output directory. DEFAULT=index.sist2/"),
@ -250,6 +251,9 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "name", &scan_args->name, "Index display name. DEFAULT: (name of the directory)"),
OPT_INTEGER(0, "depth", &scan_args->depth, "Scan up to DEPTH subdirectories deep. "
"Use 0 to only scan files in PATH. DEFAULT: -1"),
OPT_STRING(0, "archive", &scan_args->archive, "Archive file mode (skip|list|shallow|recurse). "
"skip: Don't parse, list: only get file names as text, "
"shallow: Don't parse archives inside archives. DEFAULT: recurse"),
#ifndef SIST_SCAN_ONLY
OPT_GROUP("Index options"),

152
src/parsing/arc.c Normal file
View File

@ -0,0 +1,152 @@
#include "arc.h"
#include "src/ctx.h"
#define ARC_BUF_SIZE 8192
/**
 * Decide, from its name alone, whether a filter-compressed file should be
 * opened as an archive.
 *
 * The last extension is dropped and the remaining name must end in ".tar"
 * (e.g. "backup.tar.gz" -> TRUE, "notes.txt.gz" -> FALSE).
 *
 * The comparison is done in place on `filepath`; no temporary copy is made,
 * so arbitrarily long paths are safe.
 *
 * @param filepath Path of the file being examined.
 * @param ext      Offset in `filepath` of the first character after the last
 *                 '.', i.e. `filepath[ext - 1]` is that dot. Values <= 0 are
 *                 rejected.
 * @return TRUE when the name without its last extension ends in ".tar",
 *         FALSE otherwise.
 */
int should_parse_filtered_file(const char *filepath, int ext) {
    static const char tar_suffix[] = ".tar";
    const int suffix_len = (int) sizeof(tar_suffix) - 1;

    if (filepath == NULL || ext <= 0) {
        return FALSE;
    }

    /* Length of the name once the last extension (and its dot) is removed. */
    const int stem_len = ext - 1;
    if (stem_len < suffix_len) {
        return FALSE;
    }

    /* "tar" contains no '.', so "the text from the last dot of the stem is
     * exactly .tar" is the same test as "the stem ends with .tar". */
    if (memcmp(filepath + stem_len - suffix_len, tar_suffix, (size_t) suffix_len) == 0) {
        return TRUE;
    }
    return FALSE;
}
/**
 * vfile read hook for a file that lives inside an archive.
 *
 * Pulls up to `size` bytes of the current entry's data from the libarchive
 * handle stored in `f->arc` into `buf`.
 *
 * @return The value reported by archive_read_data(), converted to int.
 */
int arc_read(struct vfile *f, void *buf, size_t size) {
    int bytes_read = archive_read_data(f->arc, buf, size);
    return bytes_read;
}
/* State passed as `user_data` to the vfile_*_callback functions. */
typedef struct arc_data {
/* Virtual file the archive bytes are read from. */
vfile_t *f;
/* Staging buffer filled by vfile_read_callback and handed to libarchive. */
char buf[ARC_BUF_SIZE];
} arc_data_f;
int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_f *data = user_data;
if (data->f->is_fs_file && data->f->fd == -1) {
data->f->fd = open(data->f->filepath, O_RDONLY);
}
return ARCHIVE_OK;
}
long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_f *data = user_data;
*buf = data->buf;
return data->f->read(data->f, data->buf, ARC_BUF_SIZE);
}
int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_f *data = user_data;
if (data->f->close != NULL) {
data->f->close(data->f);
}
return ARCHIVE_OK;
}
/* List mode: gather the path of every regular-file entry, one per line, and
 * attach the result to `doc` as a single MetaContent line. */
static void arc_list_entries(struct archive *a, document_t *doc) {
    struct archive_entry *entry;
    dyn_buffer_t buf = dyn_buffer_create();

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (!S_ISREG(archive_entry_stat(entry)->st_mode)) {
            continue;
        }

        char *path = (char *) archive_entry_pathname(entry);
        if (path == NULL) {
            /* libarchive could not produce a name for this entry. */
            continue;
        }

        dyn_buffer_append_string(&buf, path);
        dyn_buffer_write_char(&buf, '\n');
    }
    dyn_buffer_write_char(&buf, '\0');

    meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
    if (meta_list != NULL) {
        meta_list->key = MetaContent;
        strcpy(meta_list->strval, buf.buf);
        APPEND_META(doc, meta_list);
    }

    dyn_buffer_destroy(&buf);
}

/* Parse mode: run parse() on every regular-file entry, exposing each one as
 * a virtual file named "<archive path>#/<entry path>" whose data is read
 * through arc_read. */
static void arc_parse_entries(struct archive *a, vfile_t *f, document_t *doc) {
    /* Bytes reserved after the job struct for the virtual path. */
    const size_t path_cap = PATH_MAX * 2;
    struct archive_entry *entry;

    parse_job_t *sub_job = malloc(sizeof(parse_job_t) + path_cap);
    if (sub_job == NULL) {
        fprintf(stderr, "Could not allocate parse job for %s\n", f->filepath);
        return;
    }

    /* The same job is reused for every entry; only path/stat change. */
    sub_job->vfile.close = NULL;
    sub_job->vfile.read = arc_read;
    sub_job->vfile.arc = a;
    sub_job->vfile.filepath = sub_job->filepath;
    sub_job->vfile.is_fs_file = FALSE;
    memcpy(sub_job->parent, doc->uuid, sizeof(uuid_t));

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        sub_job->info = *archive_entry_stat(entry);
        if (!S_ISREG(sub_job->info.st_mode)) {
            continue;
        }

        const char *entry_path = archive_entry_pathname(entry);
        if (entry_path == NULL) {
            continue;
        }

        /* Entry names come from the archive and can be arbitrarily long:
         * bound the write and skip entries that do not fit. */
        int written = snprintf(sub_job->filepath, path_cap, "%s#/%s", f->filepath, entry_path);
        if (written < 0 || (size_t) written >= path_cap) {
            fprintf(stderr, "Skipping archive entry, path too long: %s#/%s\n", f->filepath, entry_path);
            continue;
        }

        /* The "#/" separator guarantees at least one '/' in the path. */
        sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

        /* NOTE(review): the '.' search covers the whole virtual path, so a
         * dot in a directory or parent-archive name is used when the entry's
         * own name has none — confirm parse() tolerates this. */
        char *p = strrchr(sub_job->filepath, '.');
        if (p != NULL) {
            sub_job->ext = (int) (p - sub_job->filepath + 1);
        } else {
            sub_job->ext = (int) strlen(sub_job->filepath);
        }

        parse(sub_job);
    }

    free(sub_job);
}

/**
 * Open `f` as an archive with libarchive and process its entries according
 * to ScanCtx.archive_mode.
 *
 * - A file-system vfile is opened by name (doc->filepath).
 * - Any other vfile (i.e. an archive inside an archive) is opened through the
 *   vfile_*_callback functions, and only when the mode is ARC_MODE_RECURSE;
 *   in every other mode it is left unparsed.
 * - ARC_MODE_LIST stores the entry names as the document's text content.
 * - Otherwise each regular-file entry is handed to parse().
 *
 * Open errors are reported on stderr and the archive is skipped.
 *
 * @param f   Virtual file holding the archive bytes.
 * @param doc Document describing the archive; receives metadata in list mode
 *            and supplies the parent uuid for the entries.
 */
void parse_archive(vfile_t *f, document_t *doc) {
    struct archive *a = archive_read_new();
    archive_read_support_filter_all(a);
    archive_read_support_format_all(a);

    /* Must outlive every read on `a`: libarchive keeps a pointer to it. */
    arc_data_f data;
    data.f = f;

    int ret = 0;
    if (data.f->is_fs_file) {
        ret = archive_read_open_filename(a, doc->filepath, ARC_BUF_SIZE);
    } else if (ScanCtx.archive_mode == ARC_MODE_RECURSE) {
        ret = archive_read_open(
                a, &data,
                vfile_open_callback,
                vfile_read_callback,
                vfile_close_callback
        );
    } else {
        archive_read_free(a);
        return;
    }

    if (ret != ARCHIVE_OK) {
        fprintf(stderr, "OPEN[%d]:%s %s\n", ret, archive_error_string(a), doc->filepath);
        archive_read_free(a);
        return;
    }

    if (ScanCtx.archive_mode == ARC_MODE_LIST) {
        arc_list_entries(a, doc);
    } else {
        arc_parse_entries(a, f, doc);
    }

    archive_read_free(a);
}

12
src/parsing/arc.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef SIST2_ARC_H
#define SIST2_ARC_H
#include "src/sist.h"
/*
 * Name-based check for filter-compressed files: returns TRUE when `filepath`,
 * with its last extension removed, ends in ".tar". `ext` is the offset just
 * past the last '.' of `filepath`; 0 yields FALSE.
 */
int should_parse_filtered_file(const char *filepath, int ext);
/*
 * Opens `f` as an archive and, depending on ScanCtx.archive_mode, either
 * records the entry names as the content of `doc` or calls parse() on every
 * regular-file entry.
 */
void parse_archive(vfile_t *f, document_t *doc);
/*
 * vfile read hook for archive entries: reads up to `size` bytes of the
 * current entry of `f->arc` into `buf` via archive_read_data().
 */
int arc_read(struct vfile * f, void *buf, size_t size);
#endif

View File

@ -2,6 +2,7 @@
#include "src/ctx.h"
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
__always_inline
AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) {
@ -89,9 +90,9 @@ AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int st
int read_frame_ret = av_read_frame(pFormatCtx, &avPacket);
if (read_frame_ret != 0) {
if (read_frame_ret != AVERROR_EOF) {
fprintf(stderr, "Error reading frame: %d\n", read_frame_ret);
}
// if (read_frame_ret != AVERROR_EOF) {
// fprintf(stderr, "Error reading frame: %d\n", read_frame_ret);
// }
av_frame_free(&frame);
av_packet_unref(&avPacket);
return NULL;
@ -188,22 +189,11 @@ void append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *
}
}
void parse_media(const char *filepath, document_t *doc) {
void parse_media(AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
fprintf(stderr, "Could not allocate AVFormatContext! %s \n", filepath);
return;
}
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
fprintf(stderr, "media error: %s %s\n", filepath, av_err2str(res));
return;
}
avformat_find_stream_info(pFormatCtx, NULL);
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
@ -314,3 +304,58 @@ void parse_media(const char *filepath, document_t *doc) {
avformat_free_context(pFormatCtx);
}
/**
 * Parse a media file that exists on the file system.
 *
 * Allocates a format context, opens `filepath` with it and passes the opened
 * context to parse_media(). Allocation and open failures are reported on
 * stderr and leave `doc` untouched.
 *
 * @param filepath Path of the media file.
 * @param doc      Document passed through to parse_media().
 */
void parse_media_filename(const char *filepath, document_t *doc) {
    AVFormatContext *fmt_ctx = avformat_alloc_context();
    if (!fmt_ctx) {
        fprintf(stderr, "Could not allocate AVFormatContext! %s \n", filepath);
        return;
    }

    const int open_ret = avformat_open_input(&fmt_ctx, filepath, NULL, NULL);
    if (open_ret >= 0) {
        parse_media(fmt_ctx, doc);
    } else {
        fprintf(stderr, "media error: %s %s\n", filepath, av_err2str(open_ret));
    }
}
/**
 * AVIO read callback backed by a vfile.
 *
 * Forwards the request to the vfile's read hook; a result of 0 is translated
 * to AVERROR_EOF, any other value is returned unchanged.
 *
 * @param ptr      The struct vfile to read from (AVIO opaque pointer).
 * @param buf      Destination buffer.
 * @param buf_size Capacity of `buf` in bytes.
 */
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
    struct vfile *source = ptr;
    const int n_read = source->read(source, buf, buf_size);

    return n_read == 0 ? AVERROR_EOF : n_read;
}
/**
 * Parse a media file whose bytes come from a vfile (e.g. an archive entry)
 * instead of a file-system path.
 *
 * A custom AVIO context wrapping vfile_read() is attached to a fresh format
 * context. No seek callback is provided, so the input is read strictly
 * sequentially. On success the opened context is passed to parse_media().
 *
 * The AVIO context and its buffer belong to this function and are released on
 * every path. The buffer is freed through io_ctx->buffer rather than the
 * local pointer, because FFmpeg's documentation says libavformat may replace
 * the buffer it was given.
 *
 * @param f   Virtual file to read from; also passed to vfile_read().
 * @param doc Document passed through to parse_media().
 */
void parse_media_vfile(struct vfile *f, document_t *doc) {
    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        fprintf(stderr, "Could not allocate AVFormatContext! %s \n", f->filepath);
        return;
    }

    unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        fprintf(stderr, "Could not allocate AVIO buffer! %s \n", f->filepath);
        avformat_free_context(pFormatCtx);
        return;
    }

    AVIOContext *io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
    if (io_ctx == NULL) {
        fprintf(stderr, "Could not allocate AVIOContext! %s \n", f->filepath);
        av_free(buffer);
        avformat_free_context(pFormatCtx);
        return;
    }

    pFormatCtx->pb = io_ctx;
    pFormatCtx->flags |= AVFMT_FLAG_CUSTOM_IO;

    /* Per FFmpeg docs, a failed avformat_open_input() frees the format
     * context itself; the custom I/O context stays ours to release. */
    int res = avformat_open_input(&pFormatCtx, "", NULL, NULL);
    if (res < 0) {
        /* -5: tried to parse media that requires seek; stay silent. */
        if (res != -5) {
            fprintf(stderr, "media error: %s %s\n", f->filepath, av_err2str(res));
        }
    } else {
        parse_media(pFormatCtx, doc);
    }

    av_freep(&io_ctx->buffer);
    av_free(io_ctx);
}

View File

@ -7,6 +7,8 @@
#define MIN_VIDEO_SIZE 1024 * 64
#define MIN_IMAGE_SIZE 1024 * 2
void parse_media(const char * filepath, document_t *doc);
void parse_media_filename(const char * filepath, document_t *doc);
void parse_media_vfile(struct vfile *f, document_t *doc);
#endif

View File

@ -8,7 +8,7 @@
#define MIME_EMPTY 1
#define DONT_PARSE 0x80000000
#define SHOULD_PARSE(mime_id) (mime_id & DONT_PARSE) != DONT_PARSE
/* True when mime_id is non-zero and does not carry the DONT_PARSE flag.
 * Fully parenthesized so it composes safely with !, &&, || and with
 * expression arguments. Note: mime_id is evaluated twice. */
#define SHOULD_PARSE(mime_id) ((((mime_id) & DONT_PARSE) != DONT_PARSE) && ((mime_id) != 0))
#define PDF_MASK 0x40000000
#define IS_PDF(mime_id) (mime_id & PDF_MASK) == PDF_MASK
@ -16,6 +16,12 @@
#define FONT_MASK 0x20000000
#define IS_FONT(mime_id) (mime_id & FONT_MASK) == FONT_MASK
/* Flag bit carried by the MIME ids of archive formats. */
#define ARC_MASK 0x10000000
/* True when mime_id carries ARC_MASK. Parenthesized so that !IS_ARC(x) and
 * IS_ARC(a | b) evaluate as written. */
#define IS_ARC(mime_id) (((mime_id) & ARC_MASK) == ARC_MASK)
/* Flag bit carried by the MIME ids of archive filters. */
#define ARC_FILTER_MASK 0x08000000
/* True when mime_id carries ARC_FILTER_MASK. */
#define IS_ARC_FILTER(mime_id) (((mime_id) & ARC_FILTER_MASK) == ARC_FILTER_MASK)
enum major_mime {
MimeInvalid = 0,
MimeModel = 1,

View File

@ -20,7 +20,7 @@ enum mime {
application_freeloader=655372,
application_futuresplash=655373,
application_groupwise=655374,
application_gzip=655375,
application_gzip=655375 | 0x08000000,
application_hta=655376,
application_i_deas=655377,
application_iges=655378,
@ -82,342 +82,346 @@ enum mime {
application_vnd_xara=655434,
application_vocaltec_media_desc=655435,
application_vocaltec_media_file=655436,
application_winhelp=655437,
application_wordperfect=655438,
application_wordperfect6_0=655439,
application_wordperfect6_1=655440,
application_x_123=655441,
application_x_7z_compressed=655442,
application_x_aim=655443,
application_x_apple_diskimage=655444,
application_x_arc=655445,
application_x_archive=655446,
application_x_atari_7800_rom=655447,
application_x_authorware_bin=655448,
application_x_authorware_map=655449,
application_x_authorware_seg=655450,
application_x_avira_qua=655451,
application_x_bcpio=655452,
application_x_bittorrent=655453,
application_x_bsh=655454,
application_x_bytecode_python=655455,
application_x_bzip=655456,
application_x_bzip2=655457,
application_x_cbr=655458,
application_x_cbz=655459 | 0x40000000,
application_x_cdlink=655460,
application_x_chat=655461,
application_x_chrome_extension=655462,
application_x_cocoa=655463,
application_x_conference=655464,
application_x_coredump=655465,
application_x_cpio=655466,
application_x_dbf=655467,
application_x_dbt=655468,
application_x_debian_package=655469,
application_x_deepv=655470,
application_x_director=655471,
application_x_dmp=655472,
application_x_dosdriver=655473,
application_x_dosexec=655474,
application_x_dvi=655475,
application_x_elc=655476,
application_warc=655437,
application_winhelp=655438,
application_wordperfect=655439,
application_wordperfect6_0=655440,
application_wordperfect6_1=655441,
application_x_123=655442,
application_x_7z_compressed=655443 | 0x10000000,
application_x_aim=655444,
application_x_apple_diskimage=655445,
application_x_arc=655446 | 0x10000000,
application_x_archive=655447,
application_x_atari_7800_rom=655448,
application_x_authorware_bin=655449,
application_x_authorware_map=655450,
application_x_authorware_seg=655451,
application_x_avira_qua=655452,
application_x_bcpio=655453,
application_x_bittorrent=655454,
application_x_bsh=655455,
application_x_bytecode_python=655456,
application_x_bzip=655457,
application_x_bzip2=655458 | 0x08000000,
application_x_cbr=655459,
application_x_cbz=655460 | 0x40000000,
application_x_cdlink=655461,
application_x_chat=655462,
application_x_chrome_extension=655463,
application_x_cocoa=655464,
application_x_conference=655465,
application_x_coredump=655466,
application_x_cpio=655467,
application_x_dbf=655468,
application_x_dbt=655469,
application_x_debian_package=655470,
application_x_deepv=655471,
application_x_director=655472,
application_x_dmp=655473,
application_x_dosdriver=655474,
application_x_dosexec=655475,
application_x_dvi=655476,
application_x_elc=655477,
application_x_empty=1,
application_x_envoy=655478,
application_x_esrehber=655479,
application_x_excel=655480,
application_x_executable=655481,
application_x_font_gdos=655482,
application_x_font_pf2=655483,
application_x_font_pfm=655484,
application_x_font_sfn=655485,
application_x_font_ttf=655486 | 0x20000000,
application_x_freelance=655487,
application_x_gamecube_rom=655488,
application_x_gdbm=655489,
application_x_gettext_translation=655490,
application_x_git=655491,
application_x_gsp=655492,
application_x_gss=655493,
application_x_gtar=655494,
application_x_gzip=655495,
application_x_hdf=655496,
application_x_helpfile=655497,
application_x_httpd_imap=655498,
application_x_ima=655499,
application_x_innosetup=655500,
application_x_internett_signup=655501,
application_x_inventor=655502,
application_x_ip2=655503,
application_x_java_applet=655504,
application_x_java_commerce=655505,
application_x_java_image=655506,
application_x_java_jmod=655507,
application_x_java_keystore=655508,
application_x_kdelnk=655509,
application_x_koan=655510,
application_x_latex=655511,
application_x_livescreen=655512,
application_x_lotus=655513,
application_x_lz4=655514,
application_x_lz4_json=655515,
application_x_lzh=655516,
application_x_lzh_compressed=655517,
application_x_lzx=655518,
application_x_mach_binary=655519,
application_x_mach_executable=655520,
application_x_magic_cap_package_1_0=655521,
application_x_mathcad=655522,
application_x_maxis_dbpf=655523,
application_x_meme=655524,
application_x_midi=655525,
application_x_mif=655526,
application_x_mix_transfer=655527,
application_x_mobipocket_ebook=655528,
application_x_ms_compress_szdd=655529,
application_x_ms_pdb=655530,
application_x_ms_reader=655531,
application_x_msaccess=655532,
application_x_navi_animation=655533,
application_x_navidoc=655534,
application_x_navimap=655535,
application_x_navistyle=655536,
application_x_nes_rom=655537,
application_x_netcdf=655538,
application_x_newton_compatible_pkg=655539,
application_x_nintendo_ds_rom=655540,
application_x_object=655541,
application_x_omc=655542,
application_x_omcdatamaker=655543,
application_x_omcregerator=655544,
application_x_pagemaker=655545,
application_x_pcl=655546,
application_x_pgp_keyring=655547,
application_x_pixclscript=655548,
application_x_pkcs7_certreqresp=655549,
application_x_pkcs7_signature=655550,
application_x_project=655551,
application_x_qpro=655552,
application_x_rar=655553,
application_x_rpm=655554,
application_x_sdp=655555,
application_x_sea=655556,
application_x_seelogo=655557,
application_x_setupscript=655558,
application_x_shar=655559,
application_x_sharedlib=655560,
application_x_shockwave_flash=655561,
application_x_snappy_framed=655562,
application_x_sprite=655563,
application_x_sqlite3=655564,
application_x_sv4cpio=655565,
application_x_sv4crc=655566,
application_x_tar=655567,
application_x_tbook=655568,
application_x_terminfo=655569,
application_x_terminfo2=655570,
application_x_tex_tfm=655571,
application_x_texinfo=655572,
application_x_ustar=655573,
application_x_visio=655574,
application_x_vnd_audioexplosion_mzz=655575,
application_x_vnd_ls_xpix=655576,
application_x_vrml=655577,
application_x_wais_source=655578,
application_x_wine_extension_ini=655579,
application_x_wintalk=655580,
application_x_world=655581,
application_x_wri=655582,
application_x_x509_ca_cert=655583,
application_x_xz=655584,
application_x_zip=655585,
application_x_zstd=655586,
application_xml=655587,
application_zip=655588,
application_zlib=655589,
audio_it=458982,
audio_make=458983,
audio_mid=458984,
audio_midi=458985,
audio_mp4=458986,
audio_mpeg=458987,
audio_ogg=458988,
audio_s3m=458989,
audio_tsp_audio=458990,
audio_tsplayer=458991,
audio_vnd_qcelp=458992,
audio_voxware=458993,
audio_x_aiff=458994,
audio_x_flac=458995,
audio_x_gsm=458996,
audio_x_hx_aac_adts=458997,
audio_x_jam=458998,
audio_x_liveaudio=458999,
audio_x_m4a=459000,
audio_x_midi=459001,
audio_x_mod=459002,
audio_x_mp4a_latm=459003,
audio_x_mpeg_3=459004,
audio_x_mpequrl=459005,
audio_x_nspaudio=459006,
audio_x_pn_realaudio=459007,
audio_x_psid=459008,
audio_x_realaudio=459009,
audio_x_twinvq=459010,
audio_x_twinvq_plugin=459011,
audio_x_voc=459012,
audio_x_wav=459013,
audio_xm=459014,
font_otf=327943 | 0x20000000,
font_sfnt=327944 | 0x20000000,
font_woff=327945 | 0x20000000,
font_woff2=327946 | 0x20000000,
image_cmu_raster=524555,
image_fif=524556,
image_florian=524557,
image_g3fax=524558,
image_gif=524559,
image_heic=524560,
image_ief=524561,
image_jpeg=524562,
image_jutvision=524563,
image_naplps=524564,
image_pict=524565,
image_png=524566,
image_svg=524567 | 0x80000000,
image_svg_xml=524568 | 0x80000000,
image_tiff=524569,
image_vnd_adobe_photoshop=524570 | 0x80000000,
image_vnd_djvu=524571 | 0x80000000,
image_vnd_fpx=524572,
image_vnd_microsoft_icon=524573,
image_vnd_rn_realflash=524574,
image_vnd_rn_realpix=524575,
image_vnd_wap_wbmp=524576,
image_vnd_xiff=524577,
image_webp=524578,
image_wmf=524579,
image_x_3ds=524580,
image_x_cmu_raster=524581,
image_x_cur=524582,
image_x_dwg=524583,
image_x_eps=524584,
image_x_exr=524585,
image_x_gem=524586,
image_x_icns=524587,
image_x_icon=524588 | 0x80000000,
image_x_jg=524589,
image_x_jps=524590,
image_x_ms_bmp=524591,
image_x_niff=524592,
image_x_pcx=524593,
image_x_pict=524594,
image_x_portable_bitmap=524595,
image_x_portable_graymap=524596,
image_x_portable_pixmap=524597,
image_x_quicktime=524598,
image_x_rgb=524599,
image_x_tga=524600,
image_x_tiff=524601,
image_x_win_bitmap=524602,
image_x_xcf=524603 | 0x80000000,
image_x_xpixmap=524604 | 0x80000000,
image_x_xwindowdump=524605,
message_news=196926,
message_rfc822=196927,
model_vnd_dwf=65856,
model_vnd_gdl=65857,
model_vnd_gs_gdl=65858,
model_vrml=65859,
model_x_pov=65860,
text_PGP=590149,
text_asp=590150,
text_css=590151,
text_html=590152,
text_javascript=590153,
text_mcf=590154,
text_pascal=590155,
text_plain=590156,
text_richtext=590157,
text_rtf=590158,
text_scriplet=590159,
text_tab_separated_values=590160,
text_troff=590161,
text_uri_list=590162,
text_vnd_abc=590163,
text_vnd_fmi_flexstor=590164,
text_vnd_wap_wml=590165,
text_vnd_wap_wmlscript=590166,
text_webviewhtml=590167,
text_x_Algol68=590168,
text_x_asm=590169,
text_x_audiosoft_intra=590170,
text_x_awk=590171,
text_x_bcpl=590172,
text_x_c=590173,
text_x_c__=590174,
text_x_component=590175,
text_x_diff=590176,
text_x_fortran=590177,
text_x_java=590178,
text_x_la_asf=590179,
text_x_lisp=590180,
text_x_m=590181,
text_x_m4=590182,
text_x_makefile=590183,
text_x_ms_regedit=590184,
text_x_msdos_batch=590185,
text_x_objective_c=590186,
text_x_pascal=590187,
text_x_perl=590188,
text_x_php=590189,
text_x_po=590190,
text_x_python=590191,
text_x_ruby=590192,
text_x_sass=590193,
text_x_scss=590194,
text_x_server_parsed_html=590195,
text_x_setext=590196,
text_x_sgml=590197,
text_x_shellscript=590198,
text_x_speech=590199,
text_x_tcl=590200,
text_x_tex=590201,
text_x_uil=590202,
text_x_uuencode=590203,
text_x_vcalendar=590204,
text_x_vcard=590205,
text_xml=590206,
video_MP2T=393599,
video_animaflex=393600,
video_avi=393601,
video_avs_video=393602,
video_mp4=393603,
video_mpeg=393604,
video_quicktime=393605,
video_vdo=393606,
video_vivo=393607,
video_vnd_rn_realvideo=393608,
video_vosaic=393609,
video_webm=393610,
video_x_amt_demorun=393611,
video_x_amt_showrun=393612,
video_x_atomic3d_feature=393613,
video_x_dl=393614,
video_x_dv=393615,
video_x_fli=393616,
video_x_flv=393617,
video_x_isvideo=393618,
video_x_jng=393619 | 0x80000000,
video_x_m4v=393620,
video_x_matroska=393621,
video_x_mng=393622,
video_x_motion_jpeg=393623,
video_x_ms_asf=393624,
video_x_msvideo=393625,
video_x_qtc=393626,
video_x_sgi_movie=393627,
x_epoc_x_sisx_app=721308,
application_x_envoy=655479,
application_x_esrehber=655480,
application_x_excel=655481,
application_x_executable=655482,
application_x_font_gdos=655483,
application_x_font_pf2=655484,
application_x_font_pfm=655485,
application_x_font_sfn=655486,
application_x_font_ttf=655487 | 0x20000000,
application_x_freelance=655488,
application_x_gamecube_rom=655489,
application_x_gdbm=655490,
application_x_gettext_translation=655491,
application_x_git=655492,
application_x_gsp=655493,
application_x_gss=655494,
application_x_gtar=655495,
application_x_gzip=655496,
application_x_hdf=655497,
application_x_helpfile=655498,
application_x_httpd_imap=655499,
application_x_ima=655500,
application_x_innosetup=655501,
application_x_internett_signup=655502,
application_x_inventor=655503,
application_x_ip2=655504,
application_x_java_applet=655505,
application_x_java_commerce=655506,
application_x_java_image=655507,
application_x_java_jmod=655508,
application_x_java_keystore=655509,
application_x_kdelnk=655510,
application_x_koan=655511,
application_x_latex=655512,
application_x_livescreen=655513,
application_x_lotus=655514,
application_x_lz4=655515 | 0x08000000,
application_x_lz4_json=655516,
application_x_lzh=655517,
application_x_lzh_compressed=655518,
application_x_lzip=655519 | 0x08000000,
application_x_lzma=655520 | 0x08000000,
application_x_lzop=655521 | 0x08000000,
application_x_lzx=655522,
application_x_mach_binary=655523,
application_x_mach_executable=655524,
application_x_magic_cap_package_1_0=655525,
application_x_mathcad=655526,
application_x_maxis_dbpf=655527,
application_x_meme=655528,
application_x_midi=655529,
application_x_mif=655530,
application_x_mix_transfer=655531,
application_x_mobipocket_ebook=655532,
application_x_ms_compress_szdd=655533,
application_x_ms_pdb=655534,
application_x_ms_reader=655535,
application_x_msaccess=655536,
application_x_navi_animation=655537,
application_x_navidoc=655538,
application_x_navimap=655539,
application_x_navistyle=655540,
application_x_nes_rom=655541,
application_x_netcdf=655542,
application_x_newton_compatible_pkg=655543,
application_x_nintendo_ds_rom=655544,
application_x_object=655545,
application_x_omc=655546,
application_x_omcdatamaker=655547,
application_x_omcregerator=655548,
application_x_pagemaker=655549,
application_x_pcl=655550,
application_x_pgp_keyring=655551,
application_x_pixclscript=655552,
application_x_pkcs7_certreqresp=655553,
application_x_pkcs7_signature=655554,
application_x_project=655555,
application_x_qpro=655556,
application_x_rar=655557 | 0x10000000,
application_x_rpm=655558,
application_x_sdp=655559,
application_x_sea=655560,
application_x_seelogo=655561,
application_x_setupscript=655562,
application_x_shar=655563,
application_x_sharedlib=655564,
application_x_shockwave_flash=655565,
application_x_snappy_framed=655566,
application_x_sprite=655567,
application_x_sqlite3=655568,
application_x_sv4cpio=655569,
application_x_sv4crc=655570,
application_x_tar=655571 | 0x10000000,
application_x_tbook=655572,
application_x_terminfo=655573,
application_x_terminfo2=655574,
application_x_tex_tfm=655575,
application_x_texinfo=655576,
application_x_ustar=655577,
application_x_visio=655578,
application_x_vnd_audioexplosion_mzz=655579,
application_x_vnd_ls_xpix=655580,
application_x_vrml=655581,
application_x_wais_source=655582,
application_x_wine_extension_ini=655583,
application_x_wintalk=655584,
application_x_world=655585,
application_x_wri=655586,
application_x_x509_ca_cert=655587,
application_x_xz=655588 | 0x08000000,
application_x_zip=655589,
application_x_zstd=655590 | 0x08000000,
application_xml=655591,
application_zip=655592 | 0x10000000,
application_zlib=655593,
audio_it=458986,
audio_make=458987,
audio_mid=458988,
audio_midi=458989,
audio_mp4=458990,
audio_mpeg=458991,
audio_ogg=458992,
audio_s3m=458993,
audio_tsp_audio=458994,
audio_tsplayer=458995,
audio_vnd_qcelp=458996,
audio_voxware=458997,
audio_x_aiff=458998,
audio_x_flac=458999,
audio_x_gsm=459000,
audio_x_hx_aac_adts=459001,
audio_x_jam=459002,
audio_x_liveaudio=459003,
audio_x_m4a=459004,
audio_x_midi=459005,
audio_x_mod=459006,
audio_x_mp4a_latm=459007,
audio_x_mpeg_3=459008,
audio_x_mpequrl=459009,
audio_x_nspaudio=459010,
audio_x_pn_realaudio=459011,
audio_x_psid=459012,
audio_x_realaudio=459013,
audio_x_twinvq=459014,
audio_x_twinvq_plugin=459015,
audio_x_voc=459016,
audio_x_wav=459017,
audio_xm=459018,
font_otf=327947 | 0x20000000,
font_sfnt=327948 | 0x20000000,
font_woff=327949 | 0x20000000,
font_woff2=327950 | 0x20000000,
image_cmu_raster=524559,
image_fif=524560,
image_florian=524561,
image_g3fax=524562,
image_gif=524563,
image_heic=524564,
image_ief=524565,
image_jpeg=524566,
image_jutvision=524567,
image_naplps=524568,
image_pict=524569,
image_png=524570,
image_svg=524571 | 0x80000000,
image_svg_xml=524572 | 0x80000000,
image_tiff=524573,
image_vnd_adobe_photoshop=524574 | 0x80000000,
image_vnd_djvu=524575 | 0x80000000,
image_vnd_fpx=524576,
image_vnd_microsoft_icon=524577,
image_vnd_rn_realflash=524578,
image_vnd_rn_realpix=524579,
image_vnd_wap_wbmp=524580,
image_vnd_xiff=524581,
image_webp=524582,
image_wmf=524583,
image_x_3ds=524584,
image_x_cmu_raster=524585,
image_x_cur=524586,
image_x_dwg=524587,
image_x_eps=524588,
image_x_exr=524589,
image_x_gem=524590,
image_x_icns=524591,
image_x_icon=524592 | 0x80000000,
image_x_jg=524593,
image_x_jps=524594,
image_x_ms_bmp=524595,
image_x_niff=524596,
image_x_pcx=524597,
image_x_pict=524598,
image_x_portable_bitmap=524599,
image_x_portable_graymap=524600,
image_x_portable_pixmap=524601,
image_x_quicktime=524602,
image_x_rgb=524603,
image_x_tga=524604,
image_x_tiff=524605,
image_x_win_bitmap=524606,
image_x_xcf=524607 | 0x80000000,
image_x_xpixmap=524608 | 0x80000000,
image_x_xwindowdump=524609,
message_news=196930,
message_rfc822=196931,
model_vnd_dwf=65860,
model_vnd_gdl=65861,
model_vnd_gs_gdl=65862,
model_vrml=65863,
model_x_pov=65864,
text_PGP=590153,
text_asp=590154,
text_css=590155,
text_html=590156,
text_javascript=590157,
text_mcf=590158,
text_pascal=590159,
text_plain=590160,
text_richtext=590161,
text_rtf=590162,
text_scriplet=590163,
text_tab_separated_values=590164,
text_troff=590165,
text_uri_list=590166,
text_vnd_abc=590167,
text_vnd_fmi_flexstor=590168,
text_vnd_wap_wml=590169,
text_vnd_wap_wmlscript=590170,
text_webviewhtml=590171,
text_x_Algol68=590172,
text_x_asm=590173,
text_x_audiosoft_intra=590174,
text_x_awk=590175,
text_x_bcpl=590176,
text_x_c=590177,
text_x_c__=590178,
text_x_component=590179,
text_x_diff=590180,
text_x_fortran=590181,
text_x_java=590182,
text_x_la_asf=590183,
text_x_lisp=590184,
text_x_m=590185,
text_x_m4=590186,
text_x_makefile=590187,
text_x_ms_regedit=590188,
text_x_msdos_batch=590189,
text_x_objective_c=590190,
text_x_pascal=590191,
text_x_perl=590192,
text_x_php=590193,
text_x_po=590194,
text_x_python=590195,
text_x_ruby=590196,
text_x_sass=590197,
text_x_scss=590198,
text_x_server_parsed_html=590199,
text_x_setext=590200,
text_x_sgml=590201,
text_x_shellscript=590202,
text_x_speech=590203,
text_x_tcl=590204,
text_x_tex=590205,
text_x_uil=590206,
text_x_uuencode=590207,
text_x_vcalendar=590208,
text_x_vcard=590209,
text_xml=590210,
video_MP2T=393603,
video_animaflex=393604,
video_avi=393605,
video_avs_video=393606,
video_mp4=393607,
video_mpeg=393608,
video_quicktime=393609,
video_vdo=393610,
video_vivo=393611,
video_vnd_rn_realvideo=393612,
video_vosaic=393613,
video_webm=393614,
video_x_amt_demorun=393615,
video_x_amt_showrun=393616,
video_x_atomic3d_feature=393617,
video_x_dl=393618,
video_x_dv=393619,
video_x_fli=393620,
video_x_flv=393621,
video_x_isvideo=393622,
video_x_jng=393623 | 0x80000000,
video_x_m4v=393624,
video_x_matroska=393625,
video_x_mng=393626,
video_x_motion_jpeg=393627,
video_x_ms_asf=393628,
video_x_msvideo=393629,
video_x_qtc=393630,
video_x_sgi_movie=393631,
x_epoc_x_sisx_app=721312,
};
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj";
@ -832,6 +836,10 @@ case audio_x_hx_aac_adts: return "audio/x-hx-aac-adts";
case application_x_chrome_extension: return "application/x-chrome-extension";
case image_heic: return "image/heic";
case image_x_gem: return "image/x-gem";
case application_x_lzma: return "application/x-lzma";
case application_warc: return "application/warc";
case application_x_lzip: return "application/x-lzip";
case application_x_lzop: return "application/x-lzop";
default: return NULL;}}
GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(ext_table, "arj", (gpointer)application_arj);
@ -1337,6 +1345,10 @@ g_hash_table_insert(ext_table, "z", (gpointer)application_zlib);
g_hash_table_insert(ext_table, "pf2", (gpointer)application_x_font_pf2);
g_hash_table_insert(ext_table, "jmod", (gpointer)application_x_java_jmod);
g_hash_table_insert(ext_table, "heic", (gpointer)image_heic);
g_hash_table_insert(ext_table, "lzma", (gpointer)application_x_lzma);
g_hash_table_insert(ext_table, "warc", (gpointer)application_warc);
g_hash_table_insert(ext_table, "lz", (gpointer)application_x_lzip);
g_hash_table_insert(ext_table, "lzo", (gpointer)application_x_lzop);
return ext_table;}
GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal);
g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj);
@ -1751,5 +1763,9 @@ g_hash_table_insert(mime_table, "audio/x-hx-aac-adts", (gpointer)audio_x_hx_aac_
g_hash_table_insert(mime_table, "application/x-chrome-extension", (gpointer)application_x_chrome_extension);
g_hash_table_insert(mime_table, "image/heic", (gpointer)image_heic);
g_hash_table_insert(mime_table, "image/x-gem", (gpointer)image_x_gem);
g_hash_table_insert(mime_table, "application/x-lzma", (gpointer)application_x_lzma);
g_hash_table_insert(mime_table, "application/warc", (gpointer)application_warc);
g_hash_table_insert(mime_table, "application/x-lzip", (gpointer)application_x_lzip);
g_hash_table_insert(mime_table, "application/x-lzop", (gpointer)application_x_lzop);
return mime_table;}
#endif

View File

@ -1,9 +1,32 @@
#include <src/ctx.h>
#include "src/sist.h"
#include "src/ctx.h"
__thread magic_t Magic = NULL;
void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
// read callback for files that live directly on the filesystem.
// The descriptor is opened lazily: f->fd == -1 means "not opened yet", in which
// case f->filepath is opened read-only before the first read.
// Returns the result of read(2), or -1 when the file cannot be opened.
int fs_read(struct vfile *f, void *buf, size_t size) {
    // Only attempt open() while no descriptor is held; fail if that open fails.
    if (f->fd == -1 && (f->fd = open(f->filepath, O_RDONLY)) == -1) {
        perror("open");
        printf("%s\n", f->filepath);
        return -1;
    }

    return read(f->fd, buf, size);
}
// Invoke a vfile's close callback when one is set (close may be NULL).
// `f` is a struct vfile lvalue, not a pointer. The argument is parenthesised so
// that expressions such as `*ptr` expand as intended instead of binding `.close`
// to the wrong operand. Existing call sites omit the trailing semicolon, so the
// macro keeps supplying its own.
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
// close callback for files that live directly on the filesystem.
// Does nothing when the file was never opened (f->fd == -1).
void fs_close(struct vfile *f) {
    if (f->fd != -1) {
        close(f->fd);
        // Restore the "not open" sentinel: fs_read() uses fd == -1 to decide
        // whether to open the file, and a stale value here would make a second
        // fs_close() close a descriptor number that may already have been reused.
        f->fd = -1;
    }
}
void *read_all(parse_job_t *job, const char *buf, int bytes_read) {
void *full_buf;
@ -11,17 +34,10 @@ void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) {
full_buf = malloc(job->info.st_size);
memcpy(full_buf, buf, job->info.st_size);
} else {
if (*fd == -1) {
*fd = open(job->filepath, O_RDONLY);
if (*fd == -1) {
perror("open");
printf("%s\n", job->filepath);
return NULL;
}
}
full_buf = malloc(job->info.st_size);
memcpy(full_buf, buf, bytes_read);
int ret = read(*fd, full_buf + bytes_read, job->info.st_size - bytes_read);
int ret = job->vfile.read(&job->vfile, full_buf + bytes_read, job->info.st_size - bytes_read);
if (ret == -1) {
perror("read");
return NULL;
@ -65,24 +81,13 @@ void parse(void *arg) {
doc.mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
}
int fd = -1;
int bytes_read = 0;
if (doc.mime == 0) {
// Get mime type with libmagic
fd = open(job->filepath, O_RDONLY);
if (fd == -1) {
perror("open");
free(job);
return;
}
bytes_read = read(fd, buf, PARSE_BUF_SIZE);
bytes_read = job->vfile.read(&job->vfile, buf, PARSE_BUF_SIZE);
if (bytes_read == -1) {
perror("read");
close(fd);
free(job);
CLOSE_FILE(job->vfile)
return;
}
@ -100,11 +105,16 @@ void parse(void *arg) {
if (!(SHOULD_PARSE(doc.mime))) {
} else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) ||
(mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
parse_media(job->filepath, &doc);
(mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
if (job->vfile.is_fs_file) {
parse_media_filename(job->filepath, &doc);
} else {
parse_media_vfile(&job->vfile, &doc);
}
} else if (IS_PDF(doc.mime)) {
void *pdf_buf = read_all(job, (char *) buf, bytes_read, &fd);
void *pdf_buf = read_all(job, (char *) buf, bytes_read);
parse_pdf(pdf_buf, doc.size, &doc);
if (pdf_buf != buf && pdf_buf != NULL) {
@ -112,22 +122,35 @@ void parse(void *arg) {
}
} else if (mmime == MimeText && ScanCtx.content_size > 0) {
parse_text(bytes_read, &fd, (char *) buf, &doc);
parse_text(bytes_read, &job->vfile, (char *) buf, &doc);
} else if (IS_FONT(doc.mime)) {
void *font_buf = read_all(job, (char *) buf, bytes_read, &fd);
void *font_buf = read_all(job, (char *) buf, bytes_read);
parse_font(font_buf, doc.size, &doc);
if (font_buf != buf && font_buf != NULL) {
free(font_buf);
}
} else if (
ScanCtx.archive_mode != ARC_MODE_SKIP && (
IS_ARC(doc.mime) ||
(IS_ARC_FILTER(doc.mime) && should_parse_filtered_file(doc.filepath, doc.ext))
)) {
parse_archive(&job->vfile, &doc);
}
//Parent meta
if (!uuid_is_null(job->parent)) {
char tmp[UUID_STR_LEN];
uuid_unparse(job->parent, tmp);
meta_line_t *meta_parent = malloc(sizeof(meta_line_t) + UUID_STR_LEN + 1);
meta_parent->key = MetaParent;
strcpy(meta_parent->strval, tmp);
APPEND_META((&doc), meta_parent)
}
write_document(&doc);
if (fd != -1) {
close(fd);
}
free(job);
CLOSE_FILE(job->vfile)
}

View File

@ -5,6 +5,9 @@
#define PARSE_BUF_SIZE 4096
int fs_read(struct vfile *f, void *buf, size_t size);
void fs_close(struct vfile *f);
void parse(void *arg);
#endif

View File

@ -1,7 +1,7 @@
#include "text.h"
#include "src/ctx.h"
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) {
void parse_text(int bytes_read, struct vfile *f, char *buf, document_t *doc) {
char *intermediate_buf;
int intermediate_buf_len;
@ -13,10 +13,6 @@ void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) {
memcpy(intermediate_buf, buf, to_copy);
} else {
if (*fd == -1) {
*fd = open(doc->filepath, O_RDONLY);
}
int to_read = MIN(ScanCtx.content_size, doc->size) - bytes_read;
intermediate_buf = malloc(to_read + bytes_read);
@ -25,7 +21,7 @@ void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) {
memcpy(intermediate_buf, buf, bytes_read);
}
read(*fd, intermediate_buf + bytes_read, to_read);
f->read(f, intermediate_buf + bytes_read, to_read);
}
text_buffer_t tex = text_buffer_create(ScanCtx.content_size);
text_buffer_append_string(&tex, intermediate_buf, intermediate_buf_len);

View File

@ -3,6 +3,6 @@
#include "src/sist.h"
void parse_text(int bytes_read, int *fd, char *buf, document_t *doc);
void parse_text(int bytes_read, struct vfile *f, char *buf, document_t *doc);
#endif

View File

@ -28,6 +28,8 @@
#include <wordexp.h>
#include "ft2build.h"
#include "freetype/freetype.h"
#include <archive.h>
#include <archive_entry.h>
#ifndef SIST_SCAN_ONLY
#include <onion/onion.h>
@ -52,6 +54,7 @@
#include "parsing/pdf.h"
#include "parsing/media.h"
#include "parsing/font.h"
#include "parsing/arc.h"
#include "cli.h"
#include "utf8.h/utf8.h"

View File

@ -119,6 +119,7 @@ static void *tpool_worker(void *arg) {
}
work->func(work->arg);
free(work->arg);
free(work);
}

View File

@ -9,6 +9,12 @@
#define IS_META_LONG(key) (key & META_LONG_MASK) == META_LONG_MASK
#define IS_META_STR(meta) (meta->key & META_STR_MASK) == META_STR_MASK
// Archive handling modes, stored as an archive_mode_t.
// ARC_MODE_SKIP: parse() does not call parse_archive() at all in this mode.
// NOTE(review): the exact difference between LIST, SHALLOW and RECURSE is not
// visible here; presumably they select increasing scan depth. Confirm in arc.c.
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
// This is written to file as an 8-bit char!
enum metakey {
MetaContent = 1 | META_STR_MASK,
@ -24,6 +30,7 @@ enum metakey {
MetaGenre = 11 | META_STR_MASK,
MetaTitle = 12 | META_STR_MASK,
MetaFontName = 13 | META_STR_MASK,
MetaParent = 14 | META_STR_MASK,
};
typedef struct index_descriptor {
@ -63,13 +70,39 @@ typedef struct document {
short ext;
meta_line_t *meta_head;
meta_line_t *meta_tail;
struct document *child_head;
struct document *child_tail;
char *filepath;
} document_t;
// Forward declaration so the callback typedefs below can refer to struct vfile.
typedef struct vfile vfile_t;
// Callback signatures for a virtual file. fs_read() and fs_close() have the
// read_func_t and close_func_t shapes respectively.
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
// NOTE(review): seek_func_t is declared, but struct vfile has no member of this type.
typedef int (*seek_func_t)(struct vfile *, size_t size, int whence);
typedef void (*close_func_t)(struct vfile *);
// A file abstraction read through callbacks, so parsers do not need to know
// where the bytes come from.
typedef struct vfile {
// fd and arc share storage. fs_read()/fs_close() use fd, with -1 meaning "not open".
// NOTE(review): arc is presumably the libarchive handle for archive-backed
// files; which member is valid presumably follows is_fs_file. Confirm.
union {
int fd;
struct archive *arc;
};
// Non-zero when the file can be reached by path: parse() then hands
// job->filepath to parse_media_filename() instead of calling parse_media_vfile().
int is_fs_file;
// Path that fs_read() opens when fd is -1.
char *filepath;
// Reads up to `size` bytes into `buf`; callers treat a return of -1 as an error.
read_func_t read;
// May be NULL; CLOSE_FILE checks for NULL before calling it.
close_func_t close;
} vfile_t;
// One unit of work handed to parse().
typedef struct parse_job_t {
// NOTE(review): presumably the offset of the file's base name within filepath. Confirm.
int base;
// Offset into filepath of the extension: parse() looks up the mime type
// using `filepath + ext`.
int ext;
// stat data for the file; read_all() uses info.st_size as the total size to buffer.
struct stat info;
// Handle through which the file's content is read and closed.
struct vfile vfile;
// UUID of the parent document. When it is not null, parse() attaches it to
// the document as a MetaParent entry.
uuid_t parent;
// Trailing path storage, declared with length 1.
// NOTE(review): presumably the job is over-allocated so the whole path fits here. Confirm at the allocation site.
char filepath[1];
} parse_job_t;

View File

@ -46,6 +46,10 @@ void dyn_buffer_write_str(dyn_buffer_t *buf, char *str) {
dyn_buffer_write_char(buf, '\0');
}
// Writes the characters of `str` to `buf` via dyn_buffer_write().
// The length passed is strlen(str), so the terminating NUL is not counted;
// unlike dyn_buffer_write_str(), no '\0' is written afterwards.
void dyn_buffer_append_string(dyn_buffer_t *buf, char *str) {
    size_t str_len = strlen(str);

    dyn_buffer_write(buf, str, str_len);
}
void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
grow_buffer_small(buf);

View File

@ -47,6 +47,8 @@ void dyn_buffer_write_char(dyn_buffer_t *buf, char c);
void dyn_buffer_write_str(dyn_buffer_t *buf, char *str);
void dyn_buffer_append_string(dyn_buffer_t *buf, char *str);
void dyn_buffer_write_int(dyn_buffer_t *buf, int d);
void dyn_buffer_write_short(dyn_buffer_t *buf, short s);

View File

@ -360,12 +360,24 @@ int file(void *p, onion_request *req, onion_response *res) {
return OCS_PROCESSED;
}
cJSON *doc = elastic_get_document(arg_uuid);
cJSON *source = cJSON_GetObjectItem(doc, "_source");
cJSON *index_id = cJSON_GetObjectItem(source, "index");
if (index_id == NULL) {
cJSON_Delete(doc);
return OCS_NOT_PROCESSED;
char *next = arg_uuid;
cJSON *doc = NULL;
cJSON *index_id = NULL;
cJSON *source = NULL;
while (true) {
doc = elastic_get_document(next);
source = cJSON_GetObjectItem(doc, "_source");
index_id = cJSON_GetObjectItem(source, "index");
if (index_id == NULL) {
cJSON_Delete(doc);
return OCS_NOT_PROCESSED;
}
cJSON *parent = cJSON_GetObjectItem(source, "parent");
if (parent == NULL) {
break;
}
next = parent->valuestring;
}
index_t *idx = get_index_by_id(index_id->valuestring);

File diff suppressed because one or more lines are too long

View File

@ -23,6 +23,15 @@ body {
border: none;
}
/* Cards for documents that have a "parent" field get the sub-document class
   (added in createDocCard); give them a distinct background in this theme. */
.sub-document {
background: #37474F;
}
/* Muted text needs its own colour to sit on the sub-document background. */
.sub-document .text-muted {
color: #8a949c !important;
}
.list-group-item {
background: #212121;
color: #e0e0e0;

View File

@ -15,6 +15,10 @@ body {
box-shadow: 0 .125rem .25rem rgba(0, 0, 0, .075) !important;
}
/* Cards for documents that have a "parent" field get the sub-document class
   (added in createDocCard). The 8-digit hex colour includes an alpha channel (1F). */
.sub-document {
background: #AB47BC1F;
}
.navbar-brand {
font-size: 1.75rem;
padding: 0;

View File

@ -165,12 +165,19 @@ function createDocCard(hit) {
let docCardBody = document.createElement("div");
docCardBody.setAttribute("class", "card-body document");
//Title
let title = makeTitle(hit);
let isSubDocument = false;
let link = document.createElement("a");
link.setAttribute("href", "f/" + hit["_id"]);
link.setAttribute("target", "_blank");
link.appendChild(title);
//Title
let title = makeTitle(hit);
if (hit["_source"].hasOwnProperty("parent")) {
docCard.classList.add("sub-document");
isSubDocument = true;
}
let tagContainer = document.createElement("div");
tagContainer.setAttribute("class", "card-text");
@ -204,7 +211,7 @@ function createDocCard(hit) {
}
// Hover
if (thumbnail && hit["_source"]["videoc"] === "gif") {
if (thumbnail && hit["_source"]["videoc"] === "gif" && !isSubDocument) {
gifOver(thumbnail, hit);
}
break;
@ -241,7 +248,7 @@ function createDocCard(hit) {
}
//Audio
if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("audioc")) {
if (mimeCategory === "audio" && hit["_source"].hasOwnProperty("audioc") && !isSubDocument) {
let audio = document.createElement("audio");
audio.setAttribute("preload", "none");
@ -267,7 +274,6 @@ function createDocCard(hit) {
docCardBody.appendChild(link);
docCard.appendChild(docCardBody);
link.appendChild(title);
docCardBody.appendChild(tagContainer);
return docCard;
@ -275,8 +281,9 @@ function createDocCard(hit) {
function makeThumbnail(mimeCategory, hit, imgWrapper, small) {
let thumbnail;
let isSubDocument = hit["_source"].hasOwnProperty("parent");
if (mimeCategory === "video" && shouldPlayVideo(hit)) {
if (mimeCategory === "video" && shouldPlayVideo(hit) && !isSubDocument) {
thumbnail = document.createElement("video");
addVidSrc("f/" + hit["_id"], hit["_source"]["mime"], thumbnail);

View File

@ -3,7 +3,7 @@
*/
function humanFileSize(bytes) {
if (bytes === 0) {
return "? B"
return "0 B"
}
let thresh = 1000;

View File

@ -11,7 +11,7 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">v1.1.9</span>
<span class="badge badge-pill version">v1.1.10</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a style="margin-left: auto" id="theme" class="btn" title="Toggle theme" href="/">Theme</a>
</nav>