From 7962a994e2ff42a8902a9f963db15f77e46984dc Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 3 Nov 2019 07:44:54 -0500 Subject: [PATCH] utf8 update + bug fixes --- .gitmodules | 9 + CMakeLists.txt | 22 +- README.md | 2 +- cJSON | 2 +- lib/harfbuzz | 1 + lib/openjpeg | 1 + mime.csv | 53 +- scripts/get_static_libs.sh | 18 +- scripts/get_static_libs_freebsd.sh | 16 + scripts/mime.py | 3 +- src/index/elastic.c | 9 +- src/io/serialize.c | 5 +- src/io/walk.c | 5 +- src/main.c | 7 +- src/parsing/font.c | 6 +- src/parsing/media.c | 68 ++- src/parsing/media.h | 1 + src/parsing/mime.c | 6 +- src/parsing/mime_generated.c | 830 +++++++++++++++++------------ src/parsing/parse.c | 16 +- src/parsing/pdf.c | 294 ++++++---- src/parsing/text.c | 17 +- src/sist.h | 2 + src/tpool.c | 20 +- src/tpool.h | 1 + src/util.c | 95 +++- src/util.h | 15 +- utf8.h | 1 + 28 files changed, 1022 insertions(+), 503 deletions(-) create mode 160000 lib/harfbuzz create mode 160000 lib/openjpeg create mode 160000 utf8.h diff --git a/.gitmodules b/.gitmodules index e706e95..09a2175 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,12 @@ [submodule "lmdb"] path = lmdb url = https://github.com/LMDB/lmdb +[submodule "utf8.h"] + path = utf8.h + url = https://github.com/sheredom/utf8.h +[submodule "lib/openjpeg"] + path = lib/openjpeg + url = https://github.com/uclouvain/openjpeg +[submodule "lib/harfbuzz"] + path = lib/harfbuzz + url = https://github.com/harfbuzz/harfbuzz diff --git a/CMakeLists.txt b/CMakeLists.txt index faf7908..cd70f5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,9 @@ if (WITH_SIST2) lmdb/libraries/liblmdb/lmdb.h lmdb/libraries/liblmdb/mdb.c lmdb/libraries/liblmdb/midl.h lmdb/libraries/liblmdb/midl.c src/cli.c src/cli.h + + # utf8.h + utf8.h/utf8.h ) endif () @@ -67,6 +70,9 @@ if (WITH_SIST2_SCAN) lmdb/libraries/liblmdb/lmdb.h lmdb/libraries/liblmdb/mdb.c lmdb/libraries/liblmdb/midl.h lmdb/libraries/liblmdb/midl.c src/cli.c src/cli.h + + # utf8.h + utf8.h/utf8.h ) endif () @@ -116,10 +122,10 @@ if (WITH_SIST2) target_compile_options(sist2 PRIVATE -# -O3 - # -march=native -# -fno-stack-protector -# -fomit-frame-pointer + -Ofast +# -march=native + -fno-stack-protector + -fomit-frame-pointer ) TARGET_LINK_LIBRARIES( @@ -150,6 +156,9 @@ if (WITH_SIST2) m bz2 magic + harfbuzz + openjp2 + freetype ) endif () @@ -187,7 +196,7 @@ if (WITH_SIST2_SCAN) ) target_compile_options(sist2_scan PRIVATE - -O3 + -Ofast # -march=native -fno-stack-protector -fomit-frame-pointer @@ -215,6 +224,9 @@ if (WITH_SIST2_SCAN) pthread m + ${PROJECT_SOURCE_DIR}/lib/libharfbuzz.a + ${PROJECT_SOURCE_DIR}/lib/libopenjp2.a + freetype ) endif () diff --git a/README.md b/README.md index fc53de3..aaaf1bc 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ File type | Library | Content | Thumbnail | Metadata :---|:---|:---|:---|:--- pdf,xps,cbz,cbr,fb2,epub | MuPDF | yes | yes, `png` | title | `audio/*` | libav | - | yes, `jpeg` | ID3 tags | -`video/*` | libav | - | yes, `jpeg` | *planned* | +`video/*` | libav | - | yes, `jpeg` | title, comment | `image/*` | libav | - | yes, `jpeg` | *planned* | ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style | `text/plain` | *(none)* | yes | no | - | diff --git a/cJSON b/cJSON index 2de7d04..533ff8a 160000 --- a/cJSON +++ b/cJSON @@ -1 +1 @@ -Subproject commit 2de7d04aaf67598e23d06573dcb4e370ebbad410 +Subproject commit 533ff8a783be0d5c81581ab17cd2aeba3f0044c1 diff --git a/lib/harfbuzz b/lib/harfbuzz new file mode 160000 index 0000000..7cde68f --- /dev/null +++ b/lib/harfbuzz @@ -0,0 +1 @@ +Subproject commit 7cde68f10cdf2c3ff77c1d9077475c0fc034c75c diff --git a/lib/openjpeg b/lib/openjpeg new file mode 160000 index 0000000..5875a6b --- /dev/null +++ b/lib/openjpeg @@ -0,0 +1 @@ +Subproject commit 5875a6b44618fb7dfd5cd6d742533eaee2014060 diff --git a/mime.csv b/mime.csv index 0572019..fc3f242 100644 --- a/mime.csv +++ b/mime.csv @@ -254,6 +254,7 @@ text/mcf, mcf text/pascal, pas text/plain, com|cmd|conf|def|g|idc|list|lst|mar|sdml|text|txt|md|groovy|license|properties|desktop|ini|rst|cmake|ipynb|readme|less|lo|go|yml|d|cs|hpp|srt text/richtext, rt|rtf|rtx +text/rtf, text/scriplet, wsc text/x-awk, awk !video/x-jng, jng @@ -263,7 +264,7 @@ image/x-xwindowdump, xwd !image/vnd.adobe.photoshop, psd text/tab-separated-values, tsv text/troff, man|me|ms|roff|t|tr -text/uri-list, uni|unis|uri|uris +text/uri-list, uji|unis|uri|uris text/vnd.abc, abc text/vnd.fmi.flexstor, flx text/vnd.wap.wmlscript, wmls @@ -360,3 +361,53 @@ application/x-wine-extension-ini, application/x-cbz, cbz application/x-cbr, cbr application/x-ms-compress-szdd, fon +application/x-atari-7800-rom, a78 +application/x-nes-rom, nes +application/x-font-pfm, pfm +application/x-gettext-translation, +image/wmf, +application/pgp-keys, +image/x-3ds, 3ds +application/x-lz4, lz4 +application/vnd.openxmlformats-officedocument.presentationml.presentation, pptx +application/vnd.oasis.opendocument.presentation, odp +application/x-msaccess, accdb +application/vnd.oasis.opendocument.spreadsheet, ods +audio/x-aiff, aiff|aif +text/x-ms-regedit, reg +application/x-gamecube-rom, +application/x-nintendo-ds-rom, +text/x-objective-c, +application/x-font-gdos, +application/x-apple-diskimage, +application/x-zstd, zst +video/x-m4v, m4v +message/news, +application/vnd.symbian.install, +application/x-lzh-compressed, +application/x-dosdriver, +application/vnd.tcpdump.pcap, pcap +x-epoc/x-sisx-app, +application/x-avira-qua, +video/MP2T, +application/x-snappy-framed, +application/x-lz4+json, jsonlz4 +application/x-dmp, dmp +application/zlib, z +application/x-pgp-keyring, +application/x-gdbm, +application/x-font-pf2, pf2 +application/x-zip, +application/x-coredump, +application/x-java-jmod, jmod +application/x-terminfo, +application/x-terminfo2, +application/x-arc, +application/vnd.lotus-1-2-3, +image/x-win-bitmap, +application/x-maxis-dbpf, +text/PGP, +audio/x-hx-aac-adts, +application/x-chrome-extension, +image/heic, heic +image/x-gem, \ No newline at end of file diff --git a/scripts/get_static_libs.sh b/scripts/get_static_libs.sh index 13f1f34..2941750 100755 --- a/scripts/get_static_libs.sh +++ b/scripts/get_static_libs.sh @@ -2,12 +2,28 @@ cd lib cd mupdf -HAVE_X11=no HAVE_GLUT=no make -j 4 +USE_SYSTEM_HARFBUZZ=yes USE_SYSTEM_OPENJPEG=yes HAVE_X11=no HAVE_GLUT=no make -j 4 cd .. mv mupdf/build/release/libmupdf.a . mv mupdf/build/release/libmupdf-third.a . +# openjp2 +cd openjpeg +#cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3 -march=native -DNDEBUG" +cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3" +make -j 4 +cd .. +mv openjpeg/bin/libopenjp2.a . + +# harfbuzz +cd harfbuzz +./autogen.sh +./configure --disable-shared --enable-static +make -j 4 +cd .. +mv harfbuzz/src/.libs/libharfbuzz.a . + # ffmpeg cd ffmpeg ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay \ diff --git a/scripts/get_static_libs_freebsd.sh b/scripts/get_static_libs_freebsd.sh index 3fb0be3..eb9c4d2 100755 --- a/scripts/get_static_libs_freebsd.sh +++ b/scripts/get_static_libs_freebsd.sh @@ -9,6 +9,22 @@ cd .. mv mupdf/build/release/libmupdf.a . mv mupdf/build/release/libmupdf-third.a . +# openjp2 +cd openjpeg +#cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3 -march=native -DNDEBUG" +cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3" +gmake -j 4 +cd .. +mv openjpeg/bin/libopenjp2.a . + +# harfbuzz +cd harfbuzz +./autogen.sh +./configure --disable-shared --enable-static +gmake -j 4 +cd .. +mv harfbuzz/src/.libs/libharfbuzz.a . + # ffmpeg cd ffmpeg ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay \ diff --git a/scripts/mime.py b/scripts/mime.py index dc5bed4..00833df 100644 --- a/scripts/mime.py +++ b/scripts/mime.py @@ -12,7 +12,8 @@ major_mime = { "audio": 7, "image": 8, "text": 9, - "application": 10 + "application": 10, + "x-epoc": 11, } pdf = ( diff --git a/src/index/elastic.c b/src/index/elastic.c index 0ff65a8..9a79a33 100644 --- a/src/index/elastic.c +++ b/src/index/elastic.c @@ -102,7 +102,14 @@ void elastic_flush() { cJSON *ret_json = cJSON_Parse(r->body); if (cJSON_GetObjectItem(ret_json, "errors")->valueint != 0) { - fprintf(stderr, "%s\n", r->body); + cJSON *err; + cJSON_ArrayForEach(err, cJSON_GetObjectItem(ret_json, "items")) { + if (cJSON_GetObjectItem(cJSON_GetObjectItem(err, "index"), "status")->valueint != 201) { + char* str = cJSON_Print(err); + fprintf(stderr, "%s\n", str); + cJSON_free(str); + } + } } cJSON_Delete(ret_json); diff --git a/src/io/serialize.c b/src/io/serialize.c index a49890b..c591536 100644 --- a/src/io/serialize.c +++ b/src/io/serialize.c @@ -236,7 +236,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func case MetaTitle: { buf.cur = 0; while ((c = getc(file)) != 0) { - if (!(SHOULD_IGNORE_CHAR(c)) || c == ' ') { + if (SHOULD_KEEP_CHAR(c) || c == ' ') { dyn_buffer_write_char(&buf, (char) c); } } @@ -244,6 +244,9 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func cJSON_AddStringToObject(document, get_meta_key_text(key), buf.buf); break; } + default: + fprintf(stderr, "Invalid meta key (corrupt index): %x", key); + break; } key = getc(file); diff --git a/src/io/walk.c b/src/io/walk.c index 2d724ab..483ac90 100644 --- a/src/io/walk.c +++ b/src/io/walk.c @@ -3,14 +3,13 @@ parse_job_t *create_parse_job(const char *filepath, const struct stat *info, int base) { int len = (int) strlen(filepath); - parse_job_t *job = malloc(sizeof(parse_job_t) + len); - memcpy(&(job->filepath), filepath, len + 1); + strcpy(job->filepath, filepath); job->base = base; char *p = strrchr(filepath + base, '.'); if (p != NULL) { - job->ext = (int)(p - filepath + 1); + job->ext = (int) (p - filepath + 1); } else { job->ext = len; } diff --git a/src/main.c b/src/main.c index b1deacb..8558367 100644 --- a/src/main.c +++ b/src/main.c @@ -10,7 +10,7 @@ #define EPILOG "Made by simon987 . Released under GPL-3.0" -static const char *const Version = "1.0.14"; +static const char *const Version = "1.1.0"; static const char *const usage[] = { "sist2 scan [OPTION]... PATH", "sist2 index [OPTION]... INDEX", @@ -52,11 +52,10 @@ void sist2_scan(scan_args_t *args) { ScanCtx.tn_qscale = args->quality; ScanCtx.tn_size = args->size; ScanCtx.content_size = args->content_size; - ScanCtx.pool = tpool_create(args->threads, serializer_cleanup); ScanCtx.threads = args->threads; strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path)); strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name)); - strcpy(ScanCtx.index.desc.root, args->path); + strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root)); ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root); init_dir(ScanCtx.index.path); @@ -93,6 +92,8 @@ void sist2_scan(scan_args_t *args) { printf("Loaded %d items in to mtime table.", g_hash_table_size(ScanCtx.original_table)); } + ScanCtx.pool = tpool_create(args->threads, serializer_cleanup); + tpool_start(ScanCtx.pool); walk_directory_tree(ScanCtx.index.desc.root); tpool_wait(ScanCtx.pool); tpool_destroy(ScanCtx.pool); diff --git a/src/parsing/font.c b/src/parsing/font.c index 19b216d..7049831 100644 --- a/src/parsing/font.c +++ b/src/parsing/font.c @@ -186,7 +186,11 @@ void parse_font(const char *buf, size_t buf_len, document_t *doc) { err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER); if (err != 0) { - continue; + c = c >= 'a' && c <= 'z' ? c - 32 : c + 32; + err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER); + if (err != 0) { + continue; + } } glyph_t glyph = ft_glyph_to_glyph(face->glyph); diff --git a/src/parsing/media.c b/src/parsing/media.c index 0fa8a17..ad95bca 100644 --- a/src/parsing/media.c +++ b/src/parsing/media.c @@ -1,6 +1,9 @@ #include "src/sist.h" #include "src/ctx.h" +#define MIN_SIZE 32 + +__always_inline AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) { AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG); @@ -22,8 +25,8 @@ AVCodecContext *alloc_jpeg_encoder(int dstW, int dstH, float qscale) { return jpeg; } +__always_inline AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) { - AVFrame *scaled_frame = av_frame_alloc(); int dstW; int dstH; @@ -41,16 +44,22 @@ AVFrame *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int si } } + if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) { + return NULL; + } + + AVFrame *scaled_frame = av_frame_alloc(); + struct SwsContext *ctx = sws_getContext( decoder->width, decoder->height, decoder->pix_fmt, dstW, dstH, AV_PIX_FMT_YUVJ420P, SWS_FAST_BILINEAR, 0, 0, 0 ); - int dst_buf_len = avpicture_get_size(AV_PIX_FMT_YUVJ420P, dstW, dstH); + int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1); uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len); - avpicture_fill((AVPicture *) scaled_frame, dst_buf, AV_PIX_FMT_YUVJ420P, dstW, dstH); + av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1); sws_scale(ctx, (const uint8_t *const *) frame->data, frame->linesize, @@ -81,7 +90,7 @@ AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int st if (read_frame_ret != 0) { if (read_frame_ret != AVERROR_EOF) { - fprintf(stderr, "Error reading frame: %s\n", av_err2str(read_frame_ret)); + fprintf(stderr, "Error reading frame: %d\n", read_frame_ret); } av_frame_free(&frame); av_packet_unref(&avPacket); @@ -108,35 +117,40 @@ AVFrame *read_frame(AVFormatContext *pFormatCtx, AVCodecContext *decoder, int st } #define APPEND_TAG_META(doc, tag, keyname) \ - text_buffer_t tex = text_buffer_create(4096); \ - text_buffer_append_string(&tex, tag->value); \ + text_buffer_t tex = text_buffer_create(-1); \ + text_buffer_append_string0(&tex, tag->value); \ meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \ meta_tag->key = keyname; \ strcpy(meta_tag->strval, tex.dyn_buffer.buf); \ APPEND_META(doc, meta_tag) \ text_buffer_destroy(&tex); +__always_inline void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) { AVDictionaryEntry *tag = NULL; while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { - char *key = tag->key; - for (; *key; ++key) *key = (char) tolower(*key); + char key[32]; + strncpy(key, tag->key, sizeof(key)); - if (strcmp(tag->key, "artist") == 0) { + char *ptr = key; + for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr); + + if (strcmp(key, "artist") == 0) { APPEND_TAG_META(doc, tag, MetaArtist) - } else if (strcmp(tag->key, "genre") == 0) { + } else if (strcmp(key, "genre") == 0) { APPEND_TAG_META(doc, tag, MetaGenre) - } else if (strcmp(tag->key, "title") == 0) { + } else if (strcmp(key, "title") == 0) { APPEND_TAG_META(doc, tag, MetaTitle) - } else if (strcmp(tag->key, "album_artist") == 0) { + } else if (strcmp(key, "album_artist") == 0) { APPEND_TAG_META(doc, tag, MetaAlbumArtist) - } else if (strcmp(tag->key, "album") == 0) { + } else if (strcmp(key, "album") == 0) { APPEND_TAG_META(doc, tag, MetaAlbum) } } } +__always_inline void append_video_meta(AVFormatContext *pFormatCtx, document_t *doc, int include_audio_tags) { meta_line_t *meta_duration = malloc(sizeof(meta_line_t)); @@ -146,17 +160,20 @@ void append_video_meta(AVFormatContext *pFormatCtx, document_t *doc, int include meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t)); meta_bitrate->key = MetaMediaBitrate; - meta_bitrate->intval = pFormatCtx->bit_rate; + meta_bitrate->longval = pFormatCtx->bit_rate; APPEND_META(doc, meta_bitrate) AVDictionaryEntry *tag = NULL; while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { - char *key = tag->key; - for (; *key; ++key) *key = (char) tolower(*key); + char key[32]; + strncpy(key, tag->key, sizeof(key)); - if (strcmp(tag->key, "title") == 0 && include_audio_tags) { + char *ptr = key; + for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr); + + if (strcmp(key, "title") == 0 && include_audio_tags) { APPEND_TAG_META(doc, tag, MetaTitle) - } else if (strcmp(tag->key, "comment") == 0) { + } else if (strcmp(key, "comment") == 0) { APPEND_TAG_META(doc, tag, MetaContent) } } @@ -174,7 +191,7 @@ void parse_media(const char *filepath, document_t *doc) { } int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL); if (res < 0) { - printf("ERR%s %s\n", filepath, av_err2str(res)); + fprintf(stderr, "media error: %s %s\n", filepath, av_err2str(res)); return; } @@ -224,7 +241,7 @@ void parse_media(const char *filepath, document_t *doc) { append_video_meta(pFormatCtx, doc, audio_stream == -1); } - if (stream->codecpar->width <= 20 || stream->codecpar->height <= 20) { + if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) { avformat_close_input(&pFormatCtx); avformat_free_context(pFormatCtx); return; @@ -259,6 +276,14 @@ void parse_media(const char *filepath, document_t *doc) { // Scale frame AVFrame *scaled_frame = scale_frame(decoder, frame, ScanCtx.tn_size); + if (scaled_frame == NULL) { + av_frame_free(&frame); + avcodec_free_context(&decoder); + avformat_close_input(&pFormatCtx); + avformat_free_context(pFormatCtx); + return; + } + // Encode frame to jpeg AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, ScanCtx.tn_qscale); avcodec_send_frame(jpeg_encoder, scaled_frame); @@ -268,7 +293,8 @@ void parse_media(const char *filepath, document_t *doc) { avcodec_receive_packet(jpeg_encoder, &jpeg_packet); // Save thumbnail - store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) jpeg_packet.data, jpeg_packet.size); + store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) jpeg_packet.data, + jpeg_packet.size); av_packet_unref(&jpeg_packet); av_frame_free(&frame); diff --git a/src/parsing/media.h b/src/parsing/media.h index 60e2f34..694b77e 100644 --- a/src/parsing/media.h +++ b/src/parsing/media.h @@ -5,6 +5,7 @@ #include "src/sist.h" #define MIN_VIDEO_SIZE 1024 * 64 +#define MIN_IMAGE_SIZE 1024 * 2 void parse_media(const char * filepath, document_t *doc); diff --git a/src/parsing/mime.c b/src/parsing/mime.c index e418077..4a218df 100644 --- a/src/parsing/mime.c +++ b/src/parsing/mime.c @@ -1,10 +1,12 @@ #include "mime.h" unsigned int mime_get_mime_by_ext(GHashTable *ext_table, const char * ext) { - char lower[64]; + char lower[8]; char *p = lower; - while ((*ext)) { + int cnt = 0; + while ((*ext) != '\0' && cnt + 1 < sizeof(lower)) { *p++ = (char)tolower(*ext++); + cnt++; } *p = '\0'; return (size_t) g_hash_table_lookup(ext_table, lower); diff --git a/src/parsing/mime_generated.c b/src/parsing/mime_generated.c index be5d0be..53bf203 100644 --- a/src/parsing/mime_generated.c +++ b/src/parsing/mime_generated.c @@ -39,334 +39,385 @@ enum mime { application_oda=655391, application_ogg=655392, application_pdf=655393 | 0x40000000, - application_pgp_signature=655394, - application_pkcs7_signature=655395, - application_pkix_cert=655396, - application_postscript=655397, - application_pro_eng=655398, - application_ringing_tones=655399, - application_smil=655400, - application_solids=655401, - application_sounder=655402, - application_step=655403, - application_streamingmedia=655404, - application_vda=655405, - application_vnd_fdf=655406, - application_vnd_font_fontforge_sfd=655407, - application_vnd_hp_hpgl=655408, - application_vnd_iccprofile=655409, - application_vnd_ms_cab_compressed=655410, - application_vnd_ms_excel=655411, - application_vnd_ms_fontobject=655412, - application_vnd_ms_opentype=655413 | 0x20000000, - application_vnd_ms_pki_certstore=655414, - application_vnd_ms_pki_pko=655415, - application_vnd_ms_pki_seccat=655416, - application_vnd_ms_powerpoint=655417, - application_vnd_ms_project=655418, - application_vnd_oasis_opendocument_base=655419, - application_vnd_oasis_opendocument_formula=655420, - application_vnd_oasis_opendocument_graphics=655421, - application_vnd_oasis_opendocument_text=655422, - application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655423, - application_vnd_openxmlformats_officedocument_wordprocessingml_document=655424, - application_vnd_wap_wmlc=655425, - application_vnd_wap_wmlscriptc=655426, - application_vnd_xara=655427, - application_vocaltec_media_desc=655428, - application_vocaltec_media_file=655429, - application_winhelp=655430, - application_wordperfect=655431, - application_wordperfect6_0=655432, - application_wordperfect6_1=655433, - application_x_123=655434, - application_x_7z_compressed=655435, - application_x_aim=655436, - application_x_archive=655437, - application_x_authorware_bin=655438, - application_x_authorware_map=655439, - application_x_authorware_seg=655440, - application_x_bcpio=655441, - application_x_bittorrent=655442, - application_x_bsh=655443, - application_x_bytecode_python=655444, - application_x_bzip=655445, - application_x_bzip2=655446, - application_x_cbr=655447, - application_x_cbz=655448 | 0x40000000, - application_x_cdlink=655449, - application_x_chat=655450, - application_x_cocoa=655451, - application_x_conference=655452, - application_x_cpio=655453, - application_x_dbf=655454, - application_x_dbt=655455, - application_x_debian_package=655456, - application_x_deepv=655457, - application_x_director=655458, - application_x_dosexec=655459, - application_x_dvi=655460, - application_x_elc=655461, + application_pgp_keys=655394, + application_pgp_signature=655395, + application_pkcs7_signature=655396, + application_pkix_cert=655397, + application_postscript=655398, + application_pro_eng=655399, + application_ringing_tones=655400, + application_smil=655401, + application_solids=655402, + application_sounder=655403, + application_step=655404, + application_streamingmedia=655405, + application_vda=655406, + application_vnd_fdf=655407, + application_vnd_font_fontforge_sfd=655408, + application_vnd_hp_hpgl=655409, + application_vnd_iccprofile=655410, + application_vnd_lotus_1_2_3=655411, + application_vnd_ms_cab_compressed=655412, + application_vnd_ms_excel=655413, + application_vnd_ms_fontobject=655414, + application_vnd_ms_opentype=655415 | 0x20000000, + application_vnd_ms_pki_certstore=655416, + application_vnd_ms_pki_pko=655417, + application_vnd_ms_pki_seccat=655418, + application_vnd_ms_powerpoint=655419, + application_vnd_ms_project=655420, + application_vnd_oasis_opendocument_base=655421, + application_vnd_oasis_opendocument_formula=655422, + application_vnd_oasis_opendocument_graphics=655423, + application_vnd_oasis_opendocument_presentation=655424, + application_vnd_oasis_opendocument_spreadsheet=655425, + application_vnd_oasis_opendocument_text=655426, + application_vnd_openxmlformats_officedocument_presentationml_presentation=655427, + application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655428, + application_vnd_openxmlformats_officedocument_wordprocessingml_document=655429, + application_vnd_symbian_install=655430, + application_vnd_tcpdump_pcap=655431, + application_vnd_wap_wmlc=655432, + application_vnd_wap_wmlscriptc=655433, + application_vnd_xara=655434, + application_vocaltec_media_desc=655435, + application_vocaltec_media_file=655436, + application_winhelp=655437, + application_wordperfect=655438, + application_wordperfect6_0=655439, + application_wordperfect6_1=655440, + application_x_123=655441, + application_x_7z_compressed=655442, + application_x_aim=655443, + application_x_apple_diskimage=655444, + application_x_arc=655445, + application_x_archive=655446, + application_x_atari_7800_rom=655447, + application_x_authorware_bin=655448, + application_x_authorware_map=655449, + application_x_authorware_seg=655450, + application_x_avira_qua=655451, + application_x_bcpio=655452, + application_x_bittorrent=655453, + application_x_bsh=655454, + application_x_bytecode_python=655455, + application_x_bzip=655456, + application_x_bzip2=655457, + application_x_cbr=655458, + application_x_cbz=655459 | 0x40000000, + application_x_cdlink=655460, + application_x_chat=655461, + application_x_chrome_extension=655462, + application_x_cocoa=655463, + application_x_conference=655464, + application_x_coredump=655465, + application_x_cpio=655466, + application_x_dbf=655467, + application_x_dbt=655468, + application_x_debian_package=655469, + application_x_deepv=655470, + application_x_director=655471, + application_x_dmp=655472, + application_x_dosdriver=655473, + application_x_dosexec=655474, + application_x_dvi=655475, + application_x_elc=655476, application_x_empty=1, - application_x_envoy=655463, - application_x_esrehber=655464, - application_x_excel=655465, - application_x_executable=655466, - application_x_font_sfn=655467, - application_x_font_ttf=655468 | 0x20000000, - application_x_freelance=655469, - application_x_git=655470, - application_x_gsp=655471, - application_x_gss=655472, - application_x_gtar=655473, - application_x_gzip=655474, - application_x_hdf=655475, - application_x_helpfile=655476, - application_x_httpd_imap=655477, - application_x_ima=655478, - application_x_innosetup=655479, - application_x_internett_signup=655480, - application_x_inventor=655481, - application_x_ip2=655482, - application_x_java_applet=655483, - application_x_java_commerce=655484, - application_x_java_image=655485, - application_x_java_keystore=655486, - application_x_kdelnk=655487, - application_x_koan=655488, - application_x_latex=655489, - application_x_livescreen=655490, - application_x_lotus=655491, - application_x_lzh=655492, - application_x_lzx=655493, - application_x_mach_binary=655494, - application_x_mach_executable=655495, - application_x_magic_cap_package_1_0=655496, - application_x_mathcad=655497, - application_x_meme=655498, - application_x_midi=655499, - application_x_mif=655500, - application_x_mix_transfer=655501, - application_x_mobipocket_ebook=655502, - application_x_ms_compress_szdd=655503, - application_x_ms_pdb=655504, - application_x_ms_reader=655505, - application_x_navi_animation=655506, - application_x_navidoc=655507, - application_x_navimap=655508, - application_x_navistyle=655509, - application_x_netcdf=655510, - application_x_newton_compatible_pkg=655511, - application_x_object=655512, - application_x_omc=655513, - application_x_omcdatamaker=655514, - application_x_omcregerator=655515, - application_x_pagemaker=655516, - application_x_pcl=655517, - application_x_pixclscript=655518, - application_x_pkcs7_certreqresp=655519, - application_x_pkcs7_signature=655520, - application_x_project=655521, - application_x_qpro=655522, - application_x_rar=655523, - application_x_rpm=655524, - application_x_sdp=655525, - application_x_sea=655526, - application_x_seelogo=655527, - application_x_setupscript=655528, - application_x_shar=655529, - application_x_sharedlib=655530, - application_x_shockwave_flash=655531, - application_x_sprite=655532, - application_x_sqlite3=655533, - application_x_sv4cpio=655534, - application_x_sv4crc=655535, - application_x_tar=655536, - application_x_tbook=655537, - application_x_tex_tfm=655538, - application_x_texinfo=655539, - application_x_ustar=655540, - application_x_visio=655541, - application_x_vnd_audioexplosion_mzz=655542, - application_x_vnd_ls_xpix=655543, - application_x_vrml=655544, - application_x_wais_source=655545, - application_x_wine_extension_ini=655546, - application_x_wintalk=655547, - application_x_world=655548, - application_x_wri=655549, - application_x_x509_ca_cert=655550, - application_x_xz=655551, - application_xml=655552, - application_zip=655553, - audio_it=458946, - audio_make=458947, - audio_mid=458948, - audio_midi=458949, - audio_mp4=458950, - audio_mpeg=458951, - audio_ogg=458952, - audio_s3m=458953, - audio_tsp_audio=458954, - audio_tsplayer=458955, - audio_vnd_qcelp=458956, - audio_voxware=458957, - audio_x_flac=458958, - audio_x_gsm=458959, - audio_x_jam=458960, - audio_x_liveaudio=458961, - audio_x_m4a=458962, - audio_x_midi=458963, - audio_x_mod=458964, - audio_x_mp4a_latm=458965, - audio_x_mpeg_3=458966, - audio_x_mpequrl=458967, - audio_x_nspaudio=458968, - audio_x_pn_realaudio=458969, - audio_x_psid=458970, - audio_x_realaudio=458971, - audio_x_twinvq=458972, - audio_x_twinvq_plugin=458973, - audio_x_voc=458974, - audio_x_wav=458975, - audio_xm=458976, - font_otf=327905 | 0x20000000, - font_sfnt=327906 | 0x20000000, - font_woff=327907 | 0x20000000, - font_woff2=327908 | 0x20000000, - image_cmu_raster=524517, - image_fif=524518, - image_florian=524519, - image_g3fax=524520, - image_gif=524521, - image_ief=524522, - image_jpeg=524523, - image_jutvision=524524, - image_naplps=524525, - image_pict=524526, - image_png=524527, - image_svg=524528 | 0x80000000, - image_svg_xml=524529 | 0x80000000, - image_tiff=524530, - image_vnd_adobe_photoshop=524531 | 0x80000000, - image_vnd_djvu=524532 | 0x80000000, - image_vnd_fpx=524533, - image_vnd_microsoft_icon=524534, - image_vnd_rn_realflash=524535, - image_vnd_rn_realpix=524536, - image_vnd_wap_wbmp=524537, - image_vnd_xiff=524538, - image_webp=524539, - image_x_cmu_raster=524540, - image_x_cur=524541, - image_x_dwg=524542, - image_x_eps=524543, - image_x_exr=524544, - image_x_icns=524545, - image_x_icon=524546 | 0x80000000, - image_x_jg=524547, - image_x_jps=524548, - image_x_ms_bmp=524549, - image_x_niff=524550, - image_x_pcx=524551, - image_x_pict=524552, - image_x_portable_bitmap=524553, - image_x_portable_graymap=524554, - image_x_portable_pixmap=524555, - image_x_quicktime=524556, - image_x_rgb=524557, - image_x_tga=524558, - image_x_tiff=524559, - image_x_xcf=524560 | 0x80000000, - image_x_xpixmap=524561 | 0x80000000, - image_x_xwindowdump=524562, - message_rfc822=196883, - model_vnd_dwf=65812, - model_vnd_gdl=65813, - model_vnd_gs_gdl=65814, - model_vrml=65815, - model_x_pov=65816, - text_asp=590105, - text_css=590106, - text_html=590107, - text_javascript=590108, - text_mcf=590109, - text_pascal=590110, - text_plain=590111, - text_richtext=590112, - text_scriplet=590113, - text_tab_separated_values=590114, - text_troff=590115, - text_uri_list=590116, - text_vnd_abc=590117, - text_vnd_fmi_flexstor=590118, - text_vnd_wap_wml=590119, - text_vnd_wap_wmlscript=590120, - text_webviewhtml=590121, - text_x_Algol68=590122, - text_x_asm=590123, - text_x_audiosoft_intra=590124, - text_x_awk=590125, - text_x_bcpl=590126, - text_x_c=590127, - text_x_c__=590128, - text_x_component=590129, - text_x_diff=590130, - text_x_fortran=590131, - text_x_java=590132, - text_x_la_asf=590133, - text_x_lisp=590134, - text_x_m=590135, - text_x_m4=590136, - text_x_makefile=590137, - text_x_msdos_batch=590138, - text_x_pascal=590139, - text_x_perl=590140, - text_x_php=590141, - text_x_po=590142, - text_x_python=590143, - text_x_ruby=590144, - text_x_sass=590145, - text_x_scss=590146, - text_x_server_parsed_html=590147, - text_x_setext=590148, - text_x_sgml=590149, - text_x_shellscript=590150, - text_x_speech=590151, - text_x_tcl=590152, - text_x_tex=590153, - text_x_uil=590154, - text_x_uuencode=590155, - text_x_vcalendar=590156, - text_x_vcard=590157, - text_xml=590158, - video_animaflex=393551, - video_avi=393552, - video_avs_video=393553, - video_mp4=393554, - video_mpeg=393555, - video_quicktime=393556, - video_vdo=393557, - video_vivo=393558, - video_vnd_rn_realvideo=393559, - video_vosaic=393560, - video_webm=393561, - video_x_amt_demorun=393562, - video_x_amt_showrun=393563, - video_x_atomic3d_feature=393564, - video_x_dl=393565, - video_x_dv=393566, - video_x_fli=393567, - video_x_flv=393568, - video_x_isvideo=393569, - video_x_jng=393570 | 0x80000000, - video_x_matroska=393571, - video_x_mng=393572, - video_x_motion_jpeg=393573, - video_x_ms_asf=393574, - video_x_msvideo=393575, - video_x_qtc=393576, - video_x_sgi_movie=393577, + application_x_envoy=655478, + application_x_esrehber=655479, + application_x_excel=655480, + application_x_executable=655481, + application_x_font_gdos=655482, + application_x_font_pf2=655483, + application_x_font_pfm=655484, + application_x_font_sfn=655485, + application_x_font_ttf=655486 | 0x20000000, + application_x_freelance=655487, + application_x_gamecube_rom=655488, + application_x_gdbm=655489, + application_x_gettext_translation=655490, + application_x_git=655491, + application_x_gsp=655492, + application_x_gss=655493, + application_x_gtar=655494, + application_x_gzip=655495, + application_x_hdf=655496, + application_x_helpfile=655497, + application_x_httpd_imap=655498, + application_x_ima=655499, + application_x_innosetup=655500, + application_x_internett_signup=655501, + application_x_inventor=655502, + application_x_ip2=655503, + application_x_java_applet=655504, + application_x_java_commerce=655505, + application_x_java_image=655506, + application_x_java_jmod=655507, + application_x_java_keystore=655508, + application_x_kdelnk=655509, + application_x_koan=655510, + application_x_latex=655511, + application_x_livescreen=655512, + application_x_lotus=655513, + application_x_lz4=655514, + application_x_lz4_json=655515, + application_x_lzh=655516, + application_x_lzh_compressed=655517, + application_x_lzx=655518, + application_x_mach_binary=655519, + application_x_mach_executable=655520, + application_x_magic_cap_package_1_0=655521, + application_x_mathcad=655522, + application_x_maxis_dbpf=655523, + application_x_meme=655524, + application_x_midi=655525, + application_x_mif=655526, + application_x_mix_transfer=655527, + application_x_mobipocket_ebook=655528, + application_x_ms_compress_szdd=655529, + application_x_ms_pdb=655530, + application_x_ms_reader=655531, + application_x_msaccess=655532, + application_x_navi_animation=655533, + application_x_navidoc=655534, + application_x_navimap=655535, + application_x_navistyle=655536, + application_x_nes_rom=655537, + application_x_netcdf=655538, + application_x_newton_compatible_pkg=655539, + application_x_nintendo_ds_rom=655540, + application_x_object=655541, + application_x_omc=655542, + application_x_omcdatamaker=655543, + application_x_omcregerator=655544, + application_x_pagemaker=655545, + application_x_pcl=655546, + application_x_pgp_keyring=655547, + application_x_pixclscript=655548, + application_x_pkcs7_certreqresp=655549, + application_x_pkcs7_signature=655550, + application_x_project=655551, + application_x_qpro=655552, + application_x_rar=655553, + application_x_rpm=655554, + application_x_sdp=655555, + application_x_sea=655556, + application_x_seelogo=655557, + application_x_setupscript=655558, + application_x_shar=655559, + application_x_sharedlib=655560, + application_x_shockwave_flash=655561, + application_x_snappy_framed=655562, + application_x_sprite=655563, + application_x_sqlite3=655564, + application_x_sv4cpio=655565, + application_x_sv4crc=655566, + application_x_tar=655567, + application_x_tbook=655568, + application_x_terminfo=655569, + application_x_terminfo2=655570, + application_x_tex_tfm=655571, + application_x_texinfo=655572, + application_x_ustar=655573, + application_x_visio=655574, + application_x_vnd_audioexplosion_mzz=655575, + application_x_vnd_ls_xpix=655576, + application_x_vrml=655577, + application_x_wais_source=655578, + application_x_wine_extension_ini=655579, + application_x_wintalk=655580, + application_x_world=655581, + application_x_wri=655582, + application_x_x509_ca_cert=655583, + application_x_xz=655584, + application_x_zip=655585, + application_x_zstd=655586, + application_xml=655587, + application_zip=655588, + application_zlib=655589, + audio_it=458982, + audio_make=458983, + audio_mid=458984, + audio_midi=458985, + audio_mp4=458986, + audio_mpeg=458987, + audio_ogg=458988, + audio_s3m=458989, + audio_tsp_audio=458990, + audio_tsplayer=458991, + audio_vnd_qcelp=458992, + audio_voxware=458993, + audio_x_aiff=458994, + audio_x_flac=458995, + audio_x_gsm=458996, + audio_x_hx_aac_adts=458997, + audio_x_jam=458998, + audio_x_liveaudio=458999, + audio_x_m4a=459000, + audio_x_midi=459001, + audio_x_mod=459002, + audio_x_mp4a_latm=459003, + audio_x_mpeg_3=459004, + audio_x_mpequrl=459005, + audio_x_nspaudio=459006, + audio_x_pn_realaudio=459007, + audio_x_psid=459008, + audio_x_realaudio=459009, + audio_x_twinvq=459010, + audio_x_twinvq_plugin=459011, + audio_x_voc=459012, + audio_x_wav=459013, + audio_xm=459014, + font_otf=327943 | 0x20000000, + font_sfnt=327944 | 0x20000000, + font_woff=327945 | 0x20000000, + font_woff2=327946 | 0x20000000, + image_cmu_raster=524555, + image_fif=524556, + image_florian=524557, + image_g3fax=524558, + image_gif=524559, + image_heic=524560, + image_ief=524561, + image_jpeg=524562, + image_jutvision=524563, + image_naplps=524564, + image_pict=524565, + image_png=524566, + image_svg=524567 | 0x80000000, + image_svg_xml=524568 | 0x80000000, + image_tiff=524569, + image_vnd_adobe_photoshop=524570 | 0x80000000, + image_vnd_djvu=524571 | 0x80000000, + image_vnd_fpx=524572, + image_vnd_microsoft_icon=524573, + image_vnd_rn_realflash=524574, + image_vnd_rn_realpix=524575, + image_vnd_wap_wbmp=524576, + image_vnd_xiff=524577, + image_webp=524578, + image_wmf=524579, + image_x_3ds=524580, + image_x_cmu_raster=524581, + image_x_cur=524582, + image_x_dwg=524583, + image_x_eps=524584, + image_x_exr=524585, + image_x_gem=524586, + image_x_icns=524587, + image_x_icon=524588 | 0x80000000, + image_x_jg=524589, + image_x_jps=524590, + image_x_ms_bmp=524591, + image_x_niff=524592, + image_x_pcx=524593, + image_x_pict=524594, + image_x_portable_bitmap=524595, + image_x_portable_graymap=524596, + image_x_portable_pixmap=524597, + image_x_quicktime=524598, + image_x_rgb=524599, + image_x_tga=524600, + image_x_tiff=524601, + image_x_win_bitmap=524602, + image_x_xcf=524603 | 0x80000000, + image_x_xpixmap=524604 | 0x80000000, + image_x_xwindowdump=524605, + message_news=196926, + message_rfc822=196927, + model_vnd_dwf=65856, + model_vnd_gdl=65857, + model_vnd_gs_gdl=65858, + model_vrml=65859, + model_x_pov=65860, + text_PGP=590149, + text_asp=590150, + text_css=590151, + text_html=590152, + text_javascript=590153, + text_mcf=590154, + text_pascal=590155, + text_plain=590156, + text_richtext=590157, + text_rtf=590158, + text_scriplet=590159, + text_tab_separated_values=590160, + text_troff=590161, + text_uri_list=590162, + text_vnd_abc=590163, + text_vnd_fmi_flexstor=590164, + text_vnd_wap_wml=590165, + text_vnd_wap_wmlscript=590166, + text_webviewhtml=590167, + text_x_Algol68=590168, + text_x_asm=590169, + text_x_audiosoft_intra=590170, + text_x_awk=590171, + text_x_bcpl=590172, + text_x_c=590173, + text_x_c__=590174, + text_x_component=590175, + text_x_diff=590176, + text_x_fortran=590177, + text_x_java=590178, + text_x_la_asf=590179, + text_x_lisp=590180, + text_x_m=590181, + text_x_m4=590182, + text_x_makefile=590183, + text_x_ms_regedit=590184, + text_x_msdos_batch=590185, + text_x_objective_c=590186, + text_x_pascal=590187, + text_x_perl=590188, + text_x_php=590189, + text_x_po=590190, + text_x_python=590191, + text_x_ruby=590192, + text_x_sass=590193, + text_x_scss=590194, + text_x_server_parsed_html=590195, + text_x_setext=590196, + text_x_sgml=590197, + text_x_shellscript=590198, + text_x_speech=590199, + text_x_tcl=590200, + text_x_tex=590201, + text_x_uil=590202, + text_x_uuencode=590203, + text_x_vcalendar=590204, + text_x_vcard=590205, + text_xml=590206, + video_MP2T=393599, + video_animaflex=393600, + video_avi=393601, + video_avs_video=393602, + video_mp4=393603, + video_mpeg=393604, + video_quicktime=393605, + video_vdo=393606, + video_vivo=393607, + video_vnd_rn_realvideo=393608, + video_vosaic=393609, + video_webm=393610, + video_x_amt_demorun=393611, + video_x_amt_showrun=393612, + video_x_atomic3d_feature=393613, + video_x_dl=393614, + video_x_dv=393615, + video_x_fli=393616, + video_x_flv=393617, + video_x_isvideo=393618, + video_x_jng=393619 | 0x80000000, + video_x_m4v=393620, + video_x_matroska=393621, + video_x_mng=393622, + video_x_motion_jpeg=393623, + video_x_ms_asf=393624, + video_x_msvideo=393625, + video_x_qtc=393626, + video_x_sgi_movie=393627, + x_epoc_x_sisx_app=721308, }; char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) { case application_arj: return "application/arj"; @@ -625,6 +676,7 @@ case text_mcf: return "text/mcf"; case text_pascal: return "text/pascal"; case text_plain: return "text/plain"; case text_richtext: return "text/richtext"; +case text_rtf: return "text/rtf"; case text_scriplet: return "text/scriplet"; case text_x_awk: return "text/x-awk"; case video_x_jng: return "video/x-jng"; @@ -730,6 +782,56 @@ case application_x_wine_extension_ini: return "application/x-wine-extension-ini" case application_x_cbz: return "application/x-cbz"; case application_x_cbr: return "application/x-cbr"; case application_x_ms_compress_szdd: return "application/x-ms-compress-szdd"; +case application_x_atari_7800_rom: return "application/x-atari-7800-rom"; +case application_x_nes_rom: return "application/x-nes-rom"; +case application_x_font_pfm: return "application/x-font-pfm"; +case application_x_gettext_translation: return "application/x-gettext-translation"; +case image_wmf: return "image/wmf"; +case application_pgp_keys: return "application/pgp-keys"; +case image_x_3ds: return "image/x-3ds"; +case application_x_lz4: return "application/x-lz4"; +case application_vnd_openxmlformats_officedocument_presentationml_presentation: return "application/vnd.openxmlformats-officedocument.presentationml.presentation"; +case application_vnd_oasis_opendocument_presentation: return "application/vnd.oasis.opendocument.presentation"; +case application_x_msaccess: return "application/x-msaccess"; +case application_vnd_oasis_opendocument_spreadsheet: return "application/vnd.oasis.opendocument.spreadsheet"; +case audio_x_aiff: return "audio/x-aiff"; +case text_x_ms_regedit: return "text/x-ms-regedit"; +case application_x_gamecube_rom: return "application/x-gamecube-rom"; +case application_x_nintendo_ds_rom: return "application/x-nintendo-ds-rom"; +case text_x_objective_c: return "text/x-objective-c"; +case application_x_font_gdos: return "application/x-font-gdos"; +case application_x_apple_diskimage: return "application/x-apple-diskimage"; +case application_x_zstd: return "application/x-zstd"; +case video_x_m4v: return "video/x-m4v"; +case message_news: return "message/news"; +case application_vnd_symbian_install: return "application/vnd.symbian.install"; +case application_x_lzh_compressed: return "application/x-lzh-compressed"; +case application_x_dosdriver: return "application/x-dosdriver"; +case application_vnd_tcpdump_pcap: return "application/vnd.tcpdump.pcap"; +case x_epoc_x_sisx_app: return "x-epoc/x-sisx-app"; +case application_x_avira_qua: return "application/x-avira-qua"; +case video_MP2T: return "video/MP2T"; +case application_x_snappy_framed: return "application/x-snappy-framed"; +case application_x_lz4_json: return "application/x-lz4+json"; +case application_x_dmp: return "application/x-dmp"; +case application_zlib: return "application/zlib"; +case application_x_pgp_keyring: return "application/x-pgp-keyring"; +case application_x_gdbm: return "application/x-gdbm"; +case application_x_font_pf2: return "application/x-font-pf2"; +case application_x_zip: return "application/x-zip"; +case application_x_coredump: return "application/x-coredump"; +case application_x_java_jmod: return "application/x-java-jmod"; +case application_x_terminfo: return "application/x-terminfo"; +case application_x_terminfo2: return "application/x-terminfo2"; +case application_x_arc: return "application/x-arc"; +case application_vnd_lotus_1_2_3: return "application/vnd.lotus-1-2-3"; +case image_x_win_bitmap: return "image/x-win-bitmap"; +case application_x_maxis_dbpf: return "application/x-maxis-dbpf"; +case text_PGP: return "text/PGP"; +case audio_x_hx_aac_adts: return "audio/x-hx-aac-adts"; +case application_x_chrome_extension: return "application/x-chrome-extension"; +case image_heic: return "image/heic"; +case image_x_gem: return "image/x-gem"; default: return NULL;}} GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal); g_hash_table_insert(ext_table, "arj", (gpointer)application_arj); @@ -1097,7 +1199,7 @@ g_hash_table_insert(ext_table, "ms", (gpointer)text_troff); g_hash_table_insert(ext_table, "roff", (gpointer)text_troff); g_hash_table_insert(ext_table, "t", (gpointer)text_troff); g_hash_table_insert(ext_table, "tr", (gpointer)text_troff); -g_hash_table_insert(ext_table, "uni", (gpointer)text_uri_list); +g_hash_table_insert(ext_table, "uji", (gpointer)text_uri_list); g_hash_table_insert(ext_table, "unis", (gpointer)text_uri_list); g_hash_table_insert(ext_table, "uri", (gpointer)text_uri_list); g_hash_table_insert(ext_table, "uris", (gpointer)text_uri_list); @@ -1211,6 +1313,27 @@ g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp); g_hash_table_insert(ext_table, "cbz", (gpointer)application_x_cbz); g_hash_table_insert(ext_table, "cbr", (gpointer)application_x_cbr); g_hash_table_insert(ext_table, "fon", (gpointer)application_x_ms_compress_szdd); +g_hash_table_insert(ext_table, "a78", (gpointer)application_x_atari_7800_rom); +g_hash_table_insert(ext_table, "nes", (gpointer)application_x_nes_rom); +g_hash_table_insert(ext_table, "pfm", (gpointer)application_x_font_pfm); +g_hash_table_insert(ext_table, "3ds", (gpointer)image_x_3ds); +g_hash_table_insert(ext_table, "lz4", (gpointer)application_x_lz4); +g_hash_table_insert(ext_table, "pptx", (gpointer)application_vnd_openxmlformats_officedocument_presentationml_presentation); +g_hash_table_insert(ext_table, "odp", (gpointer)application_vnd_oasis_opendocument_presentation); +g_hash_table_insert(ext_table, "accdb", (gpointer)application_x_msaccess); +g_hash_table_insert(ext_table, "ods", (gpointer)application_vnd_oasis_opendocument_spreadsheet); +g_hash_table_insert(ext_table, "aiff", (gpointer)audio_x_aiff); +g_hash_table_insert(ext_table, "aif", (gpointer)audio_x_aiff); +g_hash_table_insert(ext_table, "reg", (gpointer)text_x_ms_regedit); +g_hash_table_insert(ext_table, "zst", (gpointer)application_x_zstd); +g_hash_table_insert(ext_table, "m4v", (gpointer)video_x_m4v); +g_hash_table_insert(ext_table, "pcap", (gpointer)application_vnd_tcpdump_pcap); +g_hash_table_insert(ext_table, "jsonlz4", (gpointer)application_x_lz4_json); +g_hash_table_insert(ext_table, "dmp", (gpointer)application_x_dmp); +g_hash_table_insert(ext_table, "z", (gpointer)application_zlib); +g_hash_table_insert(ext_table, "pf2", (gpointer)application_x_font_pf2); +g_hash_table_insert(ext_table, "jmod", (gpointer)application_x_java_jmod); +g_hash_table_insert(ext_table, "heic", (gpointer)image_heic); return ext_table;} GHashTable *mime_get_mime_table() {GHashTable *mime_table = g_hash_table_new(g_str_hash, g_str_equal); g_hash_table_insert(mime_table, "application/arj", (gpointer)application_arj); @@ -1469,6 +1592,7 @@ g_hash_table_insert(mime_table, "text/mcf", (gpointer)text_mcf); g_hash_table_insert(mime_table, "text/pascal", (gpointer)text_pascal); g_hash_table_insert(mime_table, "text/plain", (gpointer)text_plain); g_hash_table_insert(mime_table, "text/richtext", (gpointer)text_richtext); +g_hash_table_insert(mime_table, "text/rtf", (gpointer)text_rtf); g_hash_table_insert(mime_table, "text/scriplet", (gpointer)text_scriplet); g_hash_table_insert(mime_table, "text/x-awk", (gpointer)text_x_awk); g_hash_table_insert(mime_table, "video/x-jng", (gpointer)video_x_jng); @@ -1574,5 +1698,55 @@ g_hash_table_insert(mime_table, "application/x-wine-extension-ini", (gpointer)ap g_hash_table_insert(mime_table, "application/x-cbz", (gpointer)application_x_cbz); g_hash_table_insert(mime_table, "application/x-cbr", (gpointer)application_x_cbr); g_hash_table_insert(mime_table, "application/x-ms-compress-szdd", (gpointer)application_x_ms_compress_szdd); +g_hash_table_insert(mime_table, "application/x-atari-7800-rom", (gpointer)application_x_atari_7800_rom); +g_hash_table_insert(mime_table, "application/x-nes-rom", (gpointer)application_x_nes_rom); +g_hash_table_insert(mime_table, "application/x-font-pfm", (gpointer)application_x_font_pfm); +g_hash_table_insert(mime_table, "application/x-gettext-translation", (gpointer)application_x_gettext_translation); +g_hash_table_insert(mime_table, "image/wmf", (gpointer)image_wmf); +g_hash_table_insert(mime_table, "application/pgp-keys", (gpointer)application_pgp_keys); +g_hash_table_insert(mime_table, "image/x-3ds", (gpointer)image_x_3ds); +g_hash_table_insert(mime_table, "application/x-lz4", (gpointer)application_x_lz4); +g_hash_table_insert(mime_table, "application/vnd.openxmlformats-officedocument.presentationml.presentation", (gpointer)application_vnd_openxmlformats_officedocument_presentationml_presentation); +g_hash_table_insert(mime_table, "application/vnd.oasis.opendocument.presentation", (gpointer)application_vnd_oasis_opendocument_presentation); +g_hash_table_insert(mime_table, "application/x-msaccess", (gpointer)application_x_msaccess); +g_hash_table_insert(mime_table, "application/vnd.oasis.opendocument.spreadsheet", (gpointer)application_vnd_oasis_opendocument_spreadsheet); +g_hash_table_insert(mime_table, "audio/x-aiff", (gpointer)audio_x_aiff); +g_hash_table_insert(mime_table, "text/x-ms-regedit", (gpointer)text_x_ms_regedit); +g_hash_table_insert(mime_table, "application/x-gamecube-rom", (gpointer)application_x_gamecube_rom); +g_hash_table_insert(mime_table, "application/x-nintendo-ds-rom", (gpointer)application_x_nintendo_ds_rom); +g_hash_table_insert(mime_table, "text/x-objective-c", (gpointer)text_x_objective_c); +g_hash_table_insert(mime_table, "application/x-font-gdos", (gpointer)application_x_font_gdos); +g_hash_table_insert(mime_table, "application/x-apple-diskimage", (gpointer)application_x_apple_diskimage); +g_hash_table_insert(mime_table, "application/x-zstd", (gpointer)application_x_zstd); +g_hash_table_insert(mime_table, "video/x-m4v", (gpointer)video_x_m4v); +g_hash_table_insert(mime_table, "message/news", (gpointer)message_news); +g_hash_table_insert(mime_table, "application/vnd.symbian.install", (gpointer)application_vnd_symbian_install); +g_hash_table_insert(mime_table, "application/x-lzh-compressed", (gpointer)application_x_lzh_compressed); +g_hash_table_insert(mime_table, "application/x-dosdriver", (gpointer)application_x_dosdriver); +g_hash_table_insert(mime_table, "application/vnd.tcpdump.pcap", (gpointer)application_vnd_tcpdump_pcap); +g_hash_table_insert(mime_table, "x-epoc/x-sisx-app", (gpointer)x_epoc_x_sisx_app); +g_hash_table_insert(mime_table, "application/x-avira-qua", (gpointer)application_x_avira_qua); +g_hash_table_insert(mime_table, "video/MP2T", (gpointer)video_MP2T); +g_hash_table_insert(mime_table, "application/x-snappy-framed", (gpointer)application_x_snappy_framed); +g_hash_table_insert(mime_table, "application/x-lz4+json", (gpointer)application_x_lz4_json); +g_hash_table_insert(mime_table, "application/x-dmp", (gpointer)application_x_dmp); +g_hash_table_insert(mime_table, "application/zlib", (gpointer)application_zlib); +g_hash_table_insert(mime_table, "application/x-pgp-keyring", (gpointer)application_x_pgp_keyring); +g_hash_table_insert(mime_table, "application/x-gdbm", (gpointer)application_x_gdbm); +g_hash_table_insert(mime_table, "application/x-font-pf2", (gpointer)application_x_font_pf2); +g_hash_table_insert(mime_table, "application/x-zip", (gpointer)application_x_zip); +g_hash_table_insert(mime_table, "application/x-coredump", (gpointer)application_x_coredump); +g_hash_table_insert(mime_table, "application/x-java-jmod", (gpointer)application_x_java_jmod); +g_hash_table_insert(mime_table, "application/x-terminfo", (gpointer)application_x_terminfo); +g_hash_table_insert(mime_table, "application/x-terminfo2", (gpointer)application_x_terminfo2); +g_hash_table_insert(mime_table, "application/x-arc", (gpointer)application_x_arc); +g_hash_table_insert(mime_table, "application/vnd.lotus-1-2-3", (gpointer)application_vnd_lotus_1_2_3); +g_hash_table_insert(mime_table, "image/x-win-bitmap", (gpointer)image_x_win_bitmap); +g_hash_table_insert(mime_table, "application/x-maxis-dbpf", (gpointer)application_x_maxis_dbpf); +g_hash_table_insert(mime_table, "text/PGP", (gpointer)text_PGP); +g_hash_table_insert(mime_table, "audio/x-hx-aac-adts", (gpointer)audio_x_hx_aac_adts); +g_hash_table_insert(mime_table, "application/x-chrome-extension", (gpointer)application_x_chrome_extension); +g_hash_table_insert(mime_table, "image/heic", (gpointer)image_heic); +g_hash_table_insert(mime_table, "image/x-gem", (gpointer)image_x_gem); return mime_table;} #endif diff --git a/src/parsing/parse.c b/src/parsing/parse.c index 08c7291..315993a 100644 --- a/src/parsing/parse.c +++ b/src/parsing/parse.c @@ -1,7 +1,7 @@ #include "src/sist.h" #include "src/ctx.h" -__thread magic_t Magic; +__thread magic_t Magic = NULL; void *read_all(parse_job_t *job, const char *buf, int bytes_read, int *fd) { @@ -62,7 +62,7 @@ void parse(void *arg) { if (job->info.st_size == 0) { doc.mime = MIME_EMPTY; - } else if (*(job->filepath + job->ext) != '\0') { + } else if (*(job->filepath + job->ext) != '\0' && (job->ext - job->base != 1)) { doc.mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext); } @@ -80,11 +80,18 @@ void parse(void *arg) { bytes_read = read(fd, buf, PARSE_BUF_SIZE); + if (bytes_read == -1) { + perror("read"); + close(fd); + free(job); + return; + } + const char *magic_mime_str = magic_buffer(Magic, buf, bytes_read); if (magic_mime_str != NULL) { doc.mime = mime_get_mime_by_string(ScanCtx.mime_table, magic_mime_str); if (doc.mime == 0) { - fprintf(stderr, "Couldn't find mime %s, %s!\n", magic_mime_str, job->filepath + job->base); + fprintf(stderr, "Couldn't find mime %s, %s\n", magic_mime_str, job->filepath + job->base); } } } @@ -93,7 +100,8 @@ void parse(void *arg) { if (!(SHOULD_PARSE(doc.mime))) { - } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || mmime == MimeAudio || mmime == MimeImage) { + } else if ((mmime == MimeVideo && doc.size >= MIN_VIDEO_SIZE) || + (mmime == MimeImage && doc.size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) { parse_media(job->filepath, &doc); } else if (IS_PDF(doc.mime)) { diff --git a/src/parsing/pdf.c b/src/parsing/pdf.c index 5f0b853..4d0e2a1 100644 --- a/src/parsing/pdf.c +++ b/src/parsing/pdf.c @@ -1,10 +1,22 @@ -#include #include "pdf.h" #include "src/ctx.h" fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { - fz_page *cover = fz_load_page(ctx, fzdoc, 0); + int err = 0; + fz_page *cover = NULL; + + fz_var(cover); + fz_try(ctx) + cover = fz_load_page(ctx, fzdoc, 0); + fz_catch(ctx) + err = 1; + + if (err != 0) { + fz_drop_page(ctx, cover); + return NULL; + } + fz_rect bounds = fz_bound_page(ctx, cover); float scale; @@ -24,24 +36,49 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { fz_clear_pixmap_with_value(ctx, pixmap, 0xFF); fz_device *dev = fz_new_draw_device(ctx, m, pixmap); - pthread_mutex_lock(&ScanCtx.mupdf_mu); + fz_var(err); fz_try(ctx) + { + pthread_mutex_lock(&ScanCtx.mupdf_mu); fz_run_page(ctx, cover, dev, fz_identity, NULL); + } fz_always(ctx) + { + fz_close_device(ctx, dev); + fz_drop_device(ctx, dev); pthread_mutex_unlock(&ScanCtx.mupdf_mu); + } fz_catch(ctx) - fz_rethrow(ctx); + err = ctx->error.errcode; - fz_drop_device(ctx, dev); + if (err != 0) { + fz_drop_page(ctx, cover); + fz_drop_pixmap(ctx, pixmap); + return NULL; + } - fz_buffer *fzbuf = fz_new_buffer_from_pixmap_as_png(ctx, pixmap, fz_default_color_params); - unsigned char *tn_buf; - size_t tn_len = fz_buffer_storage(ctx, fzbuf, &tn_buf); + fz_buffer *fzbuf = NULL; + fz_var(fzbuf); + fz_var(err); - store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len); + fz_try(ctx) + fzbuf = fz_new_buffer_from_pixmap_as_png(ctx, pixmap, fz_default_color_params); + fz_catch(ctx) + err = ctx->error.errcode; + + if (err == 0) { + unsigned char *tn_buf; + size_t tn_len = fz_buffer_storage(ctx, fzbuf, &tn_buf); + store_write(ScanCtx.index.store, (char *) doc->uuid, sizeof(doc->uuid), (char *) tn_buf, tn_len); + } - fz_drop_pixmap(ctx, pixmap); fz_drop_buffer(ctx, fzbuf); + fz_drop_pixmap(ctx, pixmap); + + if (err != 0) { + fz_drop_page(ctx, cover); + return NULL; + } return cover; } @@ -49,6 +86,32 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) { void fz_noop_callback(__attribute__((unused)) void *user, __attribute__((unused)) const char *message) {} +void init_ctx(fz_context *ctx) { + fz_disable_icc(ctx); + fz_register_document_handlers(ctx); + ctx->warn.print = fz_noop_callback; + ctx->error.print = fz_noop_callback; +} + +int read_stext_block(fz_stext_block *block, text_buffer_t *tex) { + if (block->type != FZ_STEXT_BLOCK_TEXT) { + return 0; + } + + fz_stext_line *line = block->u.t.first_line; + while (line != NULL) { + fz_stext_char *c = line->first_char; + while (c != NULL) { + if (text_buffer_append_char(tex, c->c) == TEXT_BUF_FULL) { + return TEXT_BUF_FULL; + } + c = c->next; + } + line = line->next; + } + return 0; +} + void parse_pdf(void *buf, size_t buf_len, document_t *doc) { static int mu_is_initialized = 0; @@ -57,105 +120,140 @@ void parse_pdf(void *buf, size_t buf_len, document_t *doc) { mu_is_initialized = 1; } fz_context *ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); - fz_stream *stream = NULL; - fz_document *fzdoc = NULL; - fz_var(stream); + init_ctx(ctx); + + int err = 0; + + fz_document *fzdoc = NULL; + fz_stream *stream = NULL; fz_var(fzdoc); + fz_var(stream); + fz_var(err); fz_try(ctx) { - fz_disable_icc(ctx); - fz_register_document_handlers(ctx); - - //disable warnings - ctx->warn.print = fz_noop_callback; - ctx->error.print = fz_noop_callback; - stream = fz_open_memory(ctx, buf, buf_len); fzdoc = fz_open_document_with_stream(ctx, mime_get_mime_text(doc->mime), stream); - - char title[4096] = {'\0',}; - fz_lookup_metadata(ctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); - printf("Title: %s\n", title); //todo rmv - - if (strlen(title) > 0) { - meta_line_t *meta_content = malloc(sizeof(meta_line_t) + strlen(title) + 1); - meta_content->key = MetaTitle; - strcpy(meta_content->strval, title); - APPEND_META(doc, meta_content) - } - - int page_count = fz_count_pages(ctx, fzdoc); - - fz_page *cover = render_cover(ctx, doc, fzdoc); - - fz_stext_options opts; - - text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size); - - for (int current_page = 0; current_page < page_count; current_page++) { - fz_page *page; if (current_page == 0) { - page = cover; - } else { - page = fz_load_page(ctx, fzdoc, current_page); - } - - fz_stext_page *stext = fz_new_stext_page(ctx, fz_bound_page(ctx, page)); - fz_device *dev = fz_new_stext_device(ctx, stext, &opts); - - pthread_mutex_lock(&ScanCtx.mupdf_mu); - fz_try(ctx) - fz_run_page_contents(ctx, page, dev, fz_identity, NULL); - fz_always(ctx) - pthread_mutex_unlock(&ScanCtx.mupdf_mu); - fz_catch(ctx) - fz_rethrow(ctx); - - fz_drop_device(ctx, dev); - - fz_stext_block *block = stext->first_block; - while (block != NULL) { - - if (block->type != FZ_STEXT_BLOCK_TEXT) { - block = block->next; - continue; - } - - fz_stext_line *line = block->u.t.first_line; - while (line != NULL) { - fz_stext_char *c = line->first_char; - while (c != NULL) { - if (text_buffer_append_char(&text_buf, c->c) == TEXT_BUF_FULL) { - fz_drop_page(ctx, page); - fz_drop_stext_page(ctx, stext); - goto write_loop_end; - } - c = c->next; - } - line = line->next; - } - block = block->next; - } - fz_drop_page(ctx, page); - fz_drop_stext_page(ctx, stext); - } - write_loop_end:; - text_buffer_terminate_string(&text_buf); - - meta_line_t *meta_content = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur); - meta_content->key = MetaContent; - memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur); - text_buffer_destroy(&text_buf); - APPEND_META(doc, meta_content) } - fz_always(ctx) - { + fz_catch(ctx) + err = ctx->error.errcode; + + if (err) { fz_drop_stream(ctx, stream); fz_drop_document(ctx, fzdoc); fz_drop_context(ctx); - } fz_catch(ctx) { - fprintf(stderr, "Error %s %s\n", doc->filepath, ctx->error.message); + return; } + + char title[4096] = {'\0',}; + fz_try(ctx) + fz_lookup_metadata(ctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title)); + fz_catch(ctx) + ; + + if (strlen(title) > 0) { + meta_line_t *meta_content = malloc(sizeof(meta_line_t) + strlen(title)); + meta_content->key = MetaTitle; + strcpy(meta_content->strval, title); + APPEND_META(doc, meta_content) + } + + int page_count = -1; + fz_var(err); + fz_try(ctx) + page_count = fz_count_pages(ctx, fzdoc); + fz_catch(ctx) + err = ctx->error.errcode; + + if (err) { + fz_drop_stream(ctx, stream); + fz_drop_document(ctx, fzdoc); + fz_drop_context(ctx); + return; + } + + fz_page *cover = render_cover(ctx, doc, fzdoc); + if (cover == NULL) { + fz_drop_stream(ctx, stream); + fz_drop_document(ctx, fzdoc); + fz_drop_context(ctx); + return; + } + + fz_stext_options opts = {0}; + text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size); + + for (int current_page = 0; current_page < page_count; current_page++) { + fz_page *page = NULL; + if (current_page == 0) { + page = cover; + } else { + fz_var(err); + fz_try(ctx) + page = fz_load_page(ctx, fzdoc, current_page); + fz_catch(ctx) + err = ctx->error.errcode; + if (err != 0) { + text_buffer_destroy(&text_buf); + fz_drop_page(ctx, page); + fz_drop_stream(ctx, stream); + fz_drop_document(ctx, fzdoc); + fz_drop_context(ctx); + return; + } + } + + fz_stext_page *stext = fz_new_stext_page(ctx, fz_bound_page(ctx, page)); + fz_device *dev = fz_new_stext_device(ctx, stext, &opts); + + fz_var(err); + fz_try(ctx) + fz_run_page(ctx, page, dev, fz_identity, NULL); + fz_always(ctx) + { + fz_close_device(ctx, dev); + fz_drop_device(ctx, dev); + } + fz_catch(ctx) + err = ctx->error.errcode; + + if (err != 0) { + text_buffer_destroy(&text_buf); + fz_drop_page(ctx, page); + fz_drop_stext_page(ctx, stext); + fz_drop_stream(ctx, stream); + fz_drop_document(ctx, fzdoc); + fz_drop_context(ctx); + return; + } + + fz_stext_block *block = stext->first_block; + while (block != NULL) { + int ret = read_stext_block(block, &text_buf); + if (ret == TEXT_BUF_FULL) { + break; + } + block = block->next; + } + fz_drop_stext_page(ctx, stext); + fz_drop_page(ctx, page); + + if (text_buf.dyn_buffer.cur >= text_buf.dyn_buffer.size) { + break; + } + } + text_buffer_terminate_string(&text_buf); + + meta_line_t *meta_content = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur); + meta_content->key = MetaContent; + memcpy(meta_content->strval, text_buf.dyn_buffer.buf, text_buf.dyn_buffer.cur); + APPEND_META(doc, meta_content) + + fz_drop_stream(ctx, stream); + fz_drop_document(ctx, fzdoc); + fz_drop_context(ctx); + + text_buffer_destroy(&text_buf); } diff --git a/src/parsing/text.c b/src/parsing/text.c index e71cf0e..c164ffa 100644 --- a/src/parsing/text.c +++ b/src/parsing/text.c @@ -27,17 +27,14 @@ void parse_text(int bytes_read, int *fd, char *buf, document_t *doc) { read(*fd, intermediate_buf + bytes_read, to_read); } + text_buffer_t tex = text_buffer_create(ScanCtx.content_size); + text_buffer_append_string(&tex, intermediate_buf, intermediate_buf_len); - text_buffer_t text_buf = text_buffer_create(ScanCtx.content_size); - for (int i = 0; i < intermediate_buf_len; i++) { - text_buffer_append_char(&text_buf, *(intermediate_buf + i)); - } - text_buffer_terminate_string(&text_buf); - - meta_line_t *meta = malloc(sizeof(meta_line_t) + text_buf.dyn_buffer.cur); + meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); meta->key = MetaContent; - strcpy(meta->strval, text_buf.dyn_buffer.buf); - text_buffer_destroy(&text_buf); - free(intermediate_buf); + strcpy(meta->strval, tex.dyn_buffer.buf); APPEND_META(doc, meta) + + free(intermediate_buf); + text_buffer_destroy(&tex); } diff --git a/src/sist.h b/src/sist.h index e537aa1..ab14d0a 100644 --- a/src/sist.h +++ b/src/sist.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include "parsing/media.h" #include "parsing/font.h" #include "cli.h" +#include "utf8.h/utf8.h" #ifndef SIST_SCAN_ONLY #include "src/index/elastic.h" diff --git a/src/tpool.c b/src/tpool.c index 394384b..f476a33 100644 --- a/src/tpool.c +++ b/src/tpool.c @@ -25,6 +25,7 @@ typedef struct tpool { int done_cnt; int stop; + void (*cleanup_func)(); } tpool_t; @@ -100,7 +101,7 @@ static void *tpool_worker(void *arg) { tpool_t *pool = arg; while (1) { - pthread_mutex_lock(&(pool->work_mutex)); + pthread_mutex_lock(&pool->work_mutex); if (pool->stop) { break; } @@ -120,7 +121,7 @@ static void *tpool_worker(void *arg) { pthread_mutex_lock(&(pool->work_mutex)); pool->done_cnt++; - progress_bar_print((double)pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size, ScanCtx.stat_index_size); + progress_bar_print((double) pool->done_cnt / pool->work_cnt, ScanCtx.stat_tn_size, ScanCtx.stat_index_size); if (pool->work_head == NULL) { pthread_cond_signal(&(pool->working_cond)); @@ -188,11 +189,11 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) { tpool_t *pool = malloc(sizeof(tpool_t)); pool->thread_cnt = thread_cnt; - pool->work_cnt =0; - pool->done_cnt =0; + pool->work_cnt = 0; + pool->done_cnt = 0; pool->stop = 0; pool->cleanup_func = cleanup_func; - pool->threads = malloc(sizeof(pthread_t) * thread_cnt); + pool->threads = calloc(sizeof(pthread_t), thread_cnt); pthread_mutex_init(&(pool->work_mutex), NULL); @@ -202,11 +203,14 @@ tpool_t *tpool_create(size_t thread_cnt, void cleanup_func()) { pool->work_head = NULL; pool->work_tail = NULL; - for (size_t i = 0; i < thread_cnt; i++) { + return pool; +} + +void tpool_start(tpool_t *pool) { + + for (size_t i = 0; i < pool->thread_cnt; i++) { pthread_t thread = pool->threads[i]; pthread_create(&thread, NULL, tpool_worker, pool); pthread_detach(thread); } - - return pool; } diff --git a/src/tpool.h b/src/tpool.h index c4c32b2..c4f7da2 100644 --- a/src/tpool.h +++ b/src/tpool.h @@ -9,6 +9,7 @@ typedef struct tpool tpool_t; typedef void (*thread_func_t)(void *arg); tpool_t *tpool_create(size_t num, void (*cleanup_func)()); +void tpool_start(tpool_t *pool); void tpool_destroy(tpool_t *tm); int tpool_add_work(tpool_t *pool, thread_func_t func, void *arg); diff --git a/src/util.c b/src/util.c index 45e534c..0197656 100644 --- a/src/util.c +++ b/src/util.c @@ -89,10 +89,71 @@ void text_buffer_terminate_string(text_buffer_t *buf) { dyn_buffer_write_char(&buf->dyn_buffer, '\0'); } -int text_buffer_append_string(text_buffer_t *buf, char * str) { - char * ptr = str; - while (*ptr) { - text_buffer_append_char(buf, *ptr++); +__always_inline +int utf8_validchr(const char* s) { + if (0x00 == (0x80 & *s)) { + return TRUE; + } else if (0xf0 == (0xf8 & *s)) { + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) || + (0x80 != (0xc0 & s[3]))) { + return FALSE; + } + + if (0x80 == (0xc0 & s[4])) { + return FALSE; + } + + if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) { + return FALSE; + } + } else if (0xe0 == (0xf0 & *s)) { + if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) { + return FALSE; + } + + if (0x80 == (0xc0 & s[3])) { + return FALSE; + } + + if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) { + return FALSE; + } + } else if (0xc0 == (0xe0 & *s)) { + if (0x80 != (0xc0 & s[1])) { + return FALSE; + } + + if (0x80 == (0xc0 & s[2])) { + return FALSE; + } + + if (0 == (0x1e & s[0])) { + return FALSE; + } + } else { + return FALSE; + } + + return TRUE; +} + +int text_buffer_append_string(text_buffer_t *buf, char *str, size_t len) { + + utf8_int32_t c; + for (void *v = utf8codepoint(str, &c); c != '\0' && ((char*)v - str + 4) < len; v = utf8codepoint(v, &c)) { + if (utf8_validchr(v)) { + text_buffer_append_char(buf, c); + } + } + text_buffer_terminate_string(buf); +} + +int text_buffer_append_string0(text_buffer_t *buf, char *str) { + utf8_int32_t c; + for (void *v = utf8codepoint(str, &c); c != '\0'; v = utf8codepoint(v, &c)) { + if (utf8_validchr(v)) { + text_buffer_append_char(buf, c); + } } text_buffer_terminate_string(buf); } @@ -104,15 +165,31 @@ int text_buffer_append_char(text_buffer_t *buf, int c) { dyn_buffer_write_char(&buf->dyn_buffer, ' '); buf->last_char_was_whitespace = TRUE; - if (buf->dyn_buffer.cur >= buf->max_size) { + if (buf->max_size > 0 && buf->dyn_buffer.cur >= buf->max_size) { return TEXT_BUF_FULL; } } } else { buf->last_char_was_whitespace = FALSE; - dyn_buffer_write_char(&buf->dyn_buffer, (char) c); + grow_buffer_small(&buf->dyn_buffer); - if (buf->dyn_buffer.cur >= buf->max_size) { + if (0 == ((utf8_int32_t) 0xffffff80 & c)) { + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c; + } else if (0 == ((utf8_int32_t) 0xfffff800 & c)) { + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f); + } else if (0 == ((utf8_int32_t) 0xffff0000 & c)) { + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f); + } else { + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f); + *(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f); + } + + if (buf->max_size > 0 && buf->dyn_buffer.cur >= buf->max_size) { return TEXT_BUF_FULL; } } @@ -144,7 +221,7 @@ dyn_buffer_t url_escape(char *str) { dyn_buffer_t text = dyn_buffer_create(); - char * ptr = str; + char *ptr = str; while (*ptr) { if (*ptr == '#') { dyn_buffer_write(&text, "%23", 3); @@ -177,7 +254,7 @@ char *expandpath(const char *path) { wordexp_t w; wordexp(path, &w, 0); - char * expanded = malloc(strlen(w.we_wordv[0]) + 2); + char *expanded = malloc(strlen(w.we_wordv[0]) + 2); strcpy(expanded, w.we_wordv[0]); strcat(expanded, "/"); diff --git a/src/util.h b/src/util.h index 4096f3c..a5c227f 100644 --- a/src/util.h +++ b/src/util.h @@ -5,7 +5,10 @@ #define TEXT_BUF_FULL -1 #define INITIAL_BUF_SIZE 1024 * 16 -#define SHOULD_IGNORE_CHAR(c) c < '0' || c > 'z' + +#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c)) +#define SHOULD_KEEP_CHAR(c) (c >= (int)'!') + typedef struct dyn_buffer { char *buf; @@ -21,8 +24,10 @@ typedef struct text_buffer { dyn_buffer_t dyn_buffer; } text_buffer_t; -char *abspath(const char * path); +char *abspath(const char *path); + char *expandpath(const char *path); + dyn_buffer_t url_escape(char *str); void progress_bar_print(double percentage, size_t tn_size, size_t index_size); @@ -56,14 +61,16 @@ text_buffer_t text_buffer_create(int max_size); void text_buffer_terminate_string(text_buffer_t *buf); -int text_buffer_append_string(text_buffer_t *buf, char * str); +int text_buffer_append_string(text_buffer_t *buf, char *str, size_t len); +int text_buffer_append_string0(text_buffer_t *buf, char *str); + int text_buffer_append_char(text_buffer_t *buf, int c); void incremental_put(GHashTable *table, unsigned long inode_no, int mtime); int incremental_get(GHashTable *table, unsigned long inode_no); + int incremental_mark_file_for_copy(GHashTable *table, unsigned long inode_no); - #endif diff --git a/utf8.h b/utf8.h new file mode 160000 index 0000000..2a7c5bf --- /dev/null +++ b/utf8.h @@ -0,0 +1 @@ +Subproject commit 2a7c5bfa952816cd1c674e604d31c6e0268ba770