From bd9e56829c1c09b436f6b55013d599017f422f1b Mon Sep 17 00:00:00 2001 From: simon987 Date: Thu, 30 Apr 2020 20:21:09 -0400 Subject: [PATCH] Support for markup files --- Docker/build.sh | 4 +- scripts/mime.csv | 1 + scripts/mime.py | 8 + src/main.c | 7 +- src/parsing/mime.h | 5 +- src/parsing/mime_generated.c | 387 ++++++++++++++++++----------------- src/parsing/parse.c | 6 +- src/static/search.html | 2 +- src/web/static_generated.c | 2 +- third-party/libscan | 2 +- 10 files changed, 224 insertions(+), 200 deletions(-) diff --git a/Docker/build.sh b/Docker/build.sh index 85721f4..20312c4 100755 --- a/Docker/build.sh +++ b/Docker/build.sh @@ -1,6 +1,6 @@ rm ./sist2 sist2_debug -cp ../sist2.gz ../sist2_debug.gz . -gzip -d sist2.gz sist2_debug.gz +cp ../sist2.gz . +gzip -d sist2.gz strip sist2 version=$(./sist2 --version) diff --git a/scripts/mime.csv b/scripts/mime.csv index b53342d..2059ce6 100644 --- a/scripts/mime.csv +++ b/scripts/mime.csv @@ -428,3 +428,4 @@ video/x-msvideo, divx video/x-qtc, qtc video/x-sgi-movie, movie|mv x-epoc/x-sisx-app, +application/x-zstd-dictionary, diff --git a/scripts/mime.py b/scripts/mime.py index 520f3c6..236783f 100644 --- a/scripts/mime.py +++ b/scripts/mime.py @@ -67,6 +67,12 @@ mobi = ( "application/vnd.amazon.mobi8-ebook" ) +markup = ( + "text/xml", + "text/html", + "text/x-sgml" +) + cnt = 1 @@ -89,6 +95,8 @@ def mime_id(mime): mime_id += " | 0x04000000" elif mime in mobi: mime_id += " | 0x02000000" + elif mime in markup: + mime_id += " | 0x01000000" elif mime == "application/x-empty": return "1" return mime_id diff --git a/src/main.c b/src/main.c index 16795d5..1cd7c5c 100644 --- a/src/main.c +++ b/src/main.c @@ -19,7 +19,7 @@ #define EPILOG "Made by simon987 . Released under GPL-3.0" -static const char *const Version = "2.0.0"; +static const char *const Version = "2.1.0"; static const char *const usage[] = { "sist2 scan [OPTION]... PATH", "sist2 index [OPTION]... INDEX", @@ -136,6 +136,11 @@ void initialize_scan_context(scan_args_t *args) { ScanCtx.mobi_ctx.log = _log; ScanCtx.mobi_ctx.logf = _logf; + // TEXT + ScanCtx.text_ctx.content_size = args->content_size; + ScanCtx.text_ctx.log = _log; + ScanCtx.text_ctx.logf = _logf; + ScanCtx.threads = args->threads; ScanCtx.depth = args->depth; diff --git a/src/parsing/mime.h b/src/parsing/mime.h index cfea326..fb44b1b 100644 --- a/src/parsing/mime.h +++ b/src/parsing/mime.h @@ -3,7 +3,7 @@ #include "../sist.h" -#define MAJOR_MIME(mime_id) (mime_id & 0x0FFF0000) >> 16 +#define MAJOR_MIME(mime_id) (mime_id & 0x00FF0000) >> 16 #define MIME_EMPTY 1 @@ -28,6 +28,9 @@ #define MOBI_MASK 0x02000000 #define IS_MOBI(mime_id) (mime_id & MOBI_MASK) == MOBI_MASK +#define MARKUP_MASK 0x01000000 +#define IS_MARKUP(mime_id) (mime_id & MARKUP_MASK) == MARKUP_MASK + enum major_mime { MimeInvalid = 0, MimeModel = 1, diff --git a/src/parsing/mime_generated.c b/src/parsing/mime_generated.c index 2d05308..97a8b06 100644 --- a/src/parsing/mime_generated.c +++ b/src/parsing/mime_generated.c @@ -242,198 +242,199 @@ enum mime { application_x_xz=655594 | 0x08000000, application_x_zip=655595, application_x_zstd=655596 | 0x08000000, - application_xml=655597, - application_zip=655598 | 0x10000000, - application_zlib=655599, - audio_basic=458992 | 0x80000000, - audio_it=458993, - audio_make=458994, - audio_mid=458995, - audio_midi=458996, - audio_mp4=458997, - audio_mpeg=458998, - audio_ogg=458999, - audio_s3m=459000, - audio_tsp_audio=459001, - audio_tsplayer=459002, - audio_vnd_qcelp=459003, - audio_voxware=459004, - audio_x_aiff=459005, - audio_x_flac=459006, - audio_x_gsm=459007, - audio_x_hx_aac_adts=459008, - audio_x_jam=459009, - audio_x_liveaudio=459010, - audio_x_m4a=459011, - audio_x_midi=459012, - audio_x_mod=459013, - audio_x_mp4a_latm=459014, - audio_x_mpeg_3=459015, - audio_x_mpequrl=459016, - audio_x_nspaudio=459017, - audio_x_pn_realaudio=459018, - audio_x_psid=459019, - audio_x_realaudio=459020, - audio_x_s3m=459021, - audio_x_twinvq=459022, - audio_x_twinvq_plugin=459023, - audio_x_voc=459024, - audio_x_wav=459025, - audio_x_xbox_executable=459026 | 0x80000000, - audio_x_xbox360_executable=459027 | 0x80000000, - audio_xm=459028, - font_otf=327957 | 0x20000000, - font_sfnt=327958 | 0x20000000, - font_woff=327959 | 0x20000000, - font_woff2=327960 | 0x20000000, - image_bmp=524569, - image_cmu_raster=524570, - image_fif=524571, - image_florian=524572, - image_g3fax=524573, - image_gif=524574, - image_heic=524575, - image_ief=524576, - image_jpeg=524577, - image_jutvision=524578, - image_naplps=524579, - image_pict=524580, - image_png=524581, - image_svg=524582 | 0x80000000, - image_svg_xml=524583 | 0x80000000, - image_tiff=524584, - image_vnd_adobe_photoshop=524585 | 0x80000000, - image_vnd_djvu=524586 | 0x80000000, - image_vnd_fpx=524587, - image_vnd_microsoft_icon=524588, - image_vnd_rn_realflash=524589, - image_vnd_rn_realpix=524590, - image_vnd_wap_wbmp=524591, - image_vnd_xiff=524592, - image_webp=524593, - image_wmf=524594, - image_x_3ds=524595, - image_x_award_bioslogo=524596, - image_x_cmu_raster=524597, - image_x_cur=524598, - image_x_dwg=524599, - image_x_eps=524600, - image_x_exr=524601, - image_x_gem=524602, - image_x_icns=524603, - image_x_icon=524604 | 0x80000000, - image_x_jg=524605, - image_x_jps=524606, - image_x_ms_bmp=524607, - image_x_niff=524608, - image_x_pcx=524609, - image_x_pict=524610, - image_x_portable_bitmap=524611, - image_x_portable_graymap=524612, - image_x_portable_pixmap=524613, - image_x_quicktime=524614, - image_x_rgb=524615, - image_x_tga=524616, - image_x_tiff=524617, - image_x_win_bitmap=524618, - image_x_xcf=524619 | 0x80000000, - image_x_xpixmap=524620 | 0x80000000, - image_x_xwindowdump=524621, - message_news=196942, - message_rfc822=196943, - model_vnd_dwf=65872, - model_vnd_gdl=65873, - model_vnd_gs_gdl=65874, - model_vrml=65875, - model_x_pov=65876, - text_PGP=590165, - text_asp=590166, - text_css=590167, - text_html=590168, - text_javascript=590169, - text_mcf=590170, - text_pascal=590171, - text_plain=590172, - text_richtext=590173, - text_rtf=590174, - text_scriplet=590175, - text_tab_separated_values=590176, - text_troff=590177, - text_uri_list=590178, - text_vnd_abc=590179, - text_vnd_fmi_flexstor=590180, - text_vnd_wap_wml=590181, - text_vnd_wap_wmlscript=590182, - text_webviewhtml=590183, - text_x_Algol68=590184, - text_x_asm=590185, - text_x_audiosoft_intra=590186, - text_x_awk=590187, - text_x_bcpl=590188, - text_x_c=590189, - text_x_c__=590190, - text_x_component=590191, - text_x_diff=590192, - text_x_fortran=590193, - text_x_java=590194, - text_x_la_asf=590195, - text_x_lisp=590196, - text_x_m=590197, - text_x_m4=590198, - text_x_makefile=590199, - text_x_ms_regedit=590200, - text_x_msdos_batch=590201, - text_x_objective_c=590202, - text_x_pascal=590203, - text_x_perl=590204, - text_x_php=590205, - text_x_po=590206, - text_x_python=590207, - text_x_ruby=590208, - text_x_sass=590209, - text_x_scss=590210, - text_x_server_parsed_html=590211, - text_x_setext=590212, - text_x_sgml=590213, - text_x_shellscript=590214, - text_x_speech=590215, - text_x_tcl=590216, - text_x_tex=590217, - text_x_uil=590218, - text_x_uuencode=590219, - text_x_vcalendar=590220, - text_x_vcard=590221, - text_xml=590222, - video_MP2T=393615, - video_animaflex=393616, - video_avi=393617, - video_avs_video=393618, - video_mp4=393619, - video_mpeg=393620, - video_quicktime=393621, - video_vdo=393622, - video_vivo=393623, - video_vnd_rn_realvideo=393624, - video_vosaic=393625, - video_webm=393626, - video_x_amt_demorun=393627, - video_x_amt_showrun=393628, - video_x_atomic3d_feature=393629, - video_x_dl=393630, - video_x_dv=393631, - video_x_fli=393632, - video_x_flv=393633, - video_x_isvideo=393634, - video_x_jng=393635 | 0x80000000, - video_x_m4v=393636, - video_x_matroska=393637, - video_x_mng=393638, - video_x_motion_jpeg=393639, - video_x_ms_asf=393640, - video_x_msvideo=393641, - video_x_qtc=393642, - video_x_sgi_movie=393643, - x_epoc_x_sisx_app=721324, + application_x_zstd_dictionary=655597, + application_xml=655598, + application_zip=655599 | 0x10000000, + application_zlib=655600, + audio_basic=458993 | 0x80000000, + audio_it=458994, + audio_make=458995, + audio_mid=458996, + audio_midi=458997, + audio_mp4=458998, + audio_mpeg=458999, + audio_ogg=459000, + audio_s3m=459001, + audio_tsp_audio=459002, + audio_tsplayer=459003, + audio_vnd_qcelp=459004, + audio_voxware=459005, + audio_x_aiff=459006, + audio_x_flac=459007, + audio_x_gsm=459008, + audio_x_hx_aac_adts=459009, + audio_x_jam=459010, + audio_x_liveaudio=459011, + audio_x_m4a=459012, + audio_x_midi=459013, + audio_x_mod=459014, + audio_x_mp4a_latm=459015, + audio_x_mpeg_3=459016, + audio_x_mpequrl=459017, + audio_x_nspaudio=459018, + audio_x_pn_realaudio=459019, + audio_x_psid=459020, + audio_x_realaudio=459021, + audio_x_s3m=459022, + audio_x_twinvq=459023, + audio_x_twinvq_plugin=459024, + audio_x_voc=459025, + audio_x_wav=459026, + audio_x_xbox_executable=459027 | 0x80000000, + audio_x_xbox360_executable=459028 | 0x80000000, + audio_xm=459029, + font_otf=327958 | 0x20000000, + font_sfnt=327959 | 0x20000000, + font_woff=327960 | 0x20000000, + font_woff2=327961 | 0x20000000, + image_bmp=524570, + image_cmu_raster=524571, + image_fif=524572, + image_florian=524573, + image_g3fax=524574, + image_gif=524575, + image_heic=524576, + image_ief=524577, + image_jpeg=524578, + image_jutvision=524579, + image_naplps=524580, + image_pict=524581, + image_png=524582, + image_svg=524583 | 0x80000000, + image_svg_xml=524584 | 0x80000000, + image_tiff=524585, + image_vnd_adobe_photoshop=524586 | 0x80000000, + image_vnd_djvu=524587 | 0x80000000, + image_vnd_fpx=524588, + image_vnd_microsoft_icon=524589, + image_vnd_rn_realflash=524590, + image_vnd_rn_realpix=524591, + image_vnd_wap_wbmp=524592, + image_vnd_xiff=524593, + image_webp=524594, + image_wmf=524595, + image_x_3ds=524596, + image_x_award_bioslogo=524597, + image_x_cmu_raster=524598, + image_x_cur=524599, + image_x_dwg=524600, + image_x_eps=524601, + image_x_exr=524602, + image_x_gem=524603, + image_x_icns=524604, + image_x_icon=524605 | 0x80000000, + image_x_jg=524606, + image_x_jps=524607, + image_x_ms_bmp=524608, + image_x_niff=524609, + image_x_pcx=524610, + image_x_pict=524611, + image_x_portable_bitmap=524612, + image_x_portable_graymap=524613, + image_x_portable_pixmap=524614, + image_x_quicktime=524615, + image_x_rgb=524616, + image_x_tga=524617, + image_x_tiff=524618, + image_x_win_bitmap=524619, + image_x_xcf=524620 | 0x80000000, + image_x_xpixmap=524621 | 0x80000000, + image_x_xwindowdump=524622, + message_news=196943, + message_rfc822=196944, + model_vnd_dwf=65873, + model_vnd_gdl=65874, + model_vnd_gs_gdl=65875, + model_vrml=65876, + model_x_pov=65877, + text_PGP=590166, + text_asp=590167, + text_css=590168, + text_html=590169 | 0x01000000, + text_javascript=590170, + text_mcf=590171, + text_pascal=590172, + text_plain=590173, + text_richtext=590174, + text_rtf=590175, + text_scriplet=590176, + text_tab_separated_values=590177, + text_troff=590178, + text_uri_list=590179, + text_vnd_abc=590180, + text_vnd_fmi_flexstor=590181, + text_vnd_wap_wml=590182, + text_vnd_wap_wmlscript=590183, + text_webviewhtml=590184, + text_x_Algol68=590185, + text_x_asm=590186, + text_x_audiosoft_intra=590187, + text_x_awk=590188, + text_x_bcpl=590189, + text_x_c=590190, + text_x_c__=590191, + text_x_component=590192, + text_x_diff=590193, + text_x_fortran=590194, + text_x_java=590195, + text_x_la_asf=590196, + text_x_lisp=590197, + text_x_m=590198, + text_x_m4=590199, + text_x_makefile=590200, + text_x_ms_regedit=590201, + text_x_msdos_batch=590202, + text_x_objective_c=590203, + text_x_pascal=590204, + text_x_perl=590205, + text_x_php=590206, + text_x_po=590207, + text_x_python=590208, + text_x_ruby=590209, + text_x_sass=590210, + text_x_scss=590211, + text_x_server_parsed_html=590212, + text_x_setext=590213, + text_x_sgml=590214 | 0x01000000, + text_x_shellscript=590215, + text_x_speech=590216, + text_x_tcl=590217, + text_x_tex=590218, + text_x_uil=590219, + text_x_uuencode=590220, + text_x_vcalendar=590221, + text_x_vcard=590222, + text_xml=590223 | 0x01000000, + video_MP2T=393616, + video_animaflex=393617, + video_avi=393618, + video_avs_video=393619, + video_mp4=393620, + video_mpeg=393621, + video_quicktime=393622, + video_vdo=393623, + video_vivo=393624, + video_vnd_rn_realvideo=393625, + video_vosaic=393626, + video_webm=393627, + video_x_amt_demorun=393628, + video_x_amt_showrun=393629, + video_x_atomic3d_feature=393630, + video_x_dl=393631, + video_x_dv=393632, + video_x_fli=393633, + video_x_flv=393634, + video_x_isvideo=393635, + video_x_jng=393636 | 0x80000000, + video_x_m4v=393637, + video_x_matroska=393638, + video_x_mng=393639, + video_x_motion_jpeg=393640, + video_x_ms_asf=393641, + video_x_msvideo=393642, + video_x_qtc=393643, + video_x_sgi_movie=393644, + x_epoc_x_sisx_app=721325, }; char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) { case application_arj: return "application/arj"; @@ -864,6 +865,7 @@ case video_x_msvideo: return "video/x-msvideo"; case video_x_qtc: return "video/x-qtc"; case video_x_sgi_movie: return "video/x-sgi-movie"; case x_epoc_x_sisx_app: return "x-epoc/x-sisx-app"; +case application_x_zstd_dictionary: return "application/x-zstd-dictionary"; default: return NULL;}} GHashTable *mime_get_ext_table() {GHashTable *ext_table = g_hash_table_new(g_str_hash, g_str_equal); g_hash_table_insert(ext_table, "arj", (gpointer)application_arj); @@ -1813,5 +1815,6 @@ g_hash_table_insert(mime_table, "video/x-msvideo", (gpointer)video_x_msvideo); g_hash_table_insert(mime_table, "video/x-qtc", (gpointer)video_x_qtc); g_hash_table_insert(mime_table, "video/x-sgi-movie", (gpointer)video_x_sgi_movie); g_hash_table_insert(mime_table, "x-epoc/x-sisx-app", (gpointer)x_epoc_x_sisx_app); +g_hash_table_insert(mime_table, "application/x-zstd-dictionary", (gpointer)application_x_zstd_dictionary); return mime_table;} #endif diff --git a/src/parsing/parse.c b/src/parsing/parse.c index eb75a0e..762f2eb 100644 --- a/src/parsing/parse.c +++ b/src/parsing/parse.c @@ -124,7 +124,11 @@ void parse(void *arg) { parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc.mime), &doc); } else if (mmime == MimeText && ScanCtx.text_ctx.content_size > 0) { - parse_text(&ScanCtx.text_ctx, &job->vfile, &doc); + if (IS_MARKUP(doc.mime)) { + parse_markup(&ScanCtx.text_ctx, &job->vfile, &doc); + } else { + parse_text(&ScanCtx.text_ctx, &job->vfile, &doc); + } } else if (IS_FONT(doc.mime)) { parse_font(&ScanCtx.font_ctx, &job->vfile, &doc); diff --git a/src/static/search.html b/src/static/search.html index c2e7d97..2ac8597 100644 --- a/src/static/search.html +++ b/src/static/search.html @@ -11,7 +11,7 @@