Compare commits

..

No commits in common. "bbf1aca936240aef0683a1bb620a15d6403d552f" and "f098f7916a27c51fc849a931b24b70e9fadeccf6" have entirely different histories.

15 changed files with 468 additions and 402 deletions

View File

@ -11,7 +11,7 @@ RUN ls -lh sist2-vue/dist/
FROM ubuntu:20.10 FROM ubuntu:20.10
RUN apt update && apt install -y curl libasan5 RUN apt update && apt install -y curl
RUN mkdir -p /usr/share/tessdata && \ RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \ cd /usr/share/tessdata/ && \

View File

@ -9,7 +9,7 @@ RUN strip sist2
FROM ubuntu:20.10 FROM ubuntu:20.10
RUN apt update && apt install -y curl libasan5 RUN apt update && apt install -y curl
RUN mkdir -p /usr/share/tessdata && \ RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \ cd /usr/share/tessdata/ && \

View File

@ -51,7 +51,7 @@ sist2 (Simple incremental search tool)
1. Download sist2 executable 1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) * 1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not recommended!)* 1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.2-x64-linux` 1. *(or)* `docker pull simon987/sist2:2.11.0-x64-linux`
1. See [Usage guide](docs/USAGE.md) 1. See [Usage guide](docs/USAGE.md)
@ -82,7 +82,6 @@ tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | *(none)* | yes | if embedded | creator, modified_by, title | docx, xlsx, pptx | *(none)* | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title | doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title | mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* |
\* *See [Archive files](#archive-files)* \* *See [Archive files](#archive-files)*

View File

@ -32,7 +32,7 @@ Lightning-fast file system indexer and search tool.
Scan options Scan options
-t, --threads=<int> Number of threads. DEFAULT=1 -t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3 -q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=5
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500 --size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768 --content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files. --incremental=<str> Reuse an existing index and only scan modified files.
@ -41,14 +41,12 @@ Scan options
--name=<str> Index display name. DEFAULT: (name of the directory) --name=<str> Index display name. DEFAULT: (name of the directory)
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1 --depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse --archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine) --ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
-e, --exclude=<str> Files that match this regex will not be scanned -e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type --fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005 --treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000 --mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files. --read-subtitles Read subtitles from media files
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
Index options Index options
-t, --threads=<int> Number of threads. DEFAULT=1 -t, --threads=<int> Number of threads. DEFAULT=1
@ -68,14 +66,13 @@ Web options
--bind=<str> Listen on this address. DEFAULT=localhost:4090 --bind=<str> Listen on this address. DEFAULT=localhost:4090
--auth=<str> Basic auth in user:password format --auth=<str> Basic auth in user:password format
--tag-auth=<str> Basic auth in user:password format for tagging --tag-auth=<str> Basic auth in user:password format for tagging
--tagline=<str> Tagline in navbar
--dev Serve html & js files from disk (for development)
Exec-script options Exec-script options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200 --es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2 --es-index=<str> Elasticsearch index name. DEFAULT=sist2
--script-file=<str> Path to user script. --script-file=<str> Path to user script.
--async-script Execute user script asynchronously. --async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
``` ```
## Scan ## Scan
@ -85,7 +82,7 @@ Exec-script options
* `-t, --threads` * `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!** Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality` * `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. *Does not affect PDF thumbnails quality*
* `--size` * `--size`
Thumbnail size in pixels. Thumbnail size in pixels.
* `--content-size` * `--content-size`
@ -128,7 +125,6 @@ Exec-script options
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -` To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files. * `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
### Scan examples ### Scan examples
@ -149,11 +145,15 @@ sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
### Index format ### Index format
A typical `ndjson` type index structure looks like this: A typical `binary` type index structure looks like this:
``` ```
documents.idx/ documents.idx/
├── descriptor.json ├── descriptor.json
├── _index_main.ndjson.zst ├── _index_139965416830720
├── _index_139965425223424
├── _index_139965433616128
├── _index_139965442008832
├── _index_139965442008832
├── treemap.csv ├── treemap.csv
├── agg_mime.csv ├── agg_mime.csv
├── agg_date.csv ├── agg_date.csv
@ -169,7 +169,9 @@ documents.idx/
└── lock.mdb └── lock.mdb
``` ```
The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed newline-delimited file. The `_index_*` files contain the raw binary index data and are not meant to be
read by other applications. The format is generally compatible across different
sist2 versions.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails. database containing the thumbnails.
@ -179,6 +181,66 @@ following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rew
The `.csv` are pre-computed aggregations necessary for the stats page. The `.csv` are pre-computed aggregations necessary for the stats page.
*Advanced usage*
Instead of using the `scan` module, you can also import an index generated
by a third party application. The 'external' index must have the following format:
```
my_index/
├── descriptor.json
├── _index_0
└── thumbs/
| ├── data.mdb
| └── lock.mdb
└── meta/
└── <empty>
```
*descriptor.json*:
```json
{
"uuid": "<valid UUID4>",
"version": "_external_v1",
"root": "(optional)",
"name": "<name>",
"rewrite_url": "(optional)",
"type": "json",
"timestamp": 1578971024
}
```
*_index_0*: NDJSON format (One json object per line)
```json
{
"_id": "unique uuid for the file",
"index": "index uuid4 (same one as descriptor.json!)",
"mime": "application/x-cbz",
"size": 14341204,
"mtime": 1578882996,
"extension": "cbz",
"name": "my_book",
"path": "path/to/books",
"content": "text contents of the book",
"title": "Title of the book",
"tag": ["genre.fiction", "author.someguy", "etc..."],
"_keyword": [
{"k": "ISBN", "v": "ABCD34789231"}
],
"_text": [
{"k": "other", "v": "This will be indexed as text"}
]
}
```
You can find the full list of supported fields [here](../src/io/serialize.c#L90)
The `_keyword.*` items will be indexed and searchable as **keyword** fields (only full matches allowed).
The `_text.*` items will be indexed and searchable as **text** fields (fuzzy searching allowed)
*thumbs/*: *thumbs/*:
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field) LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
@ -186,6 +248,9 @@ and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root* *\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guarantees of back/forward compatibility.
## Index ## Index
### Index options ### Index options
@ -211,7 +276,6 @@ and values are raw image bytes.
down the process. down the process.
* `-f, --force-reset` * `-f, --force-reset`
Reset Elasticsearch mappings and settings. Reset Elasticsearch mappings and settings.
* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.
### Index examples ### Index examples
@ -241,8 +305,6 @@ sist2 index --print ./my_index/ | jq | less
* `--auth=<str>` Basic auth in user:password format * `--auth=<str>` Basic auth in user:password format
* `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the * `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the
`--auth` argument, but authentication is only applied to the `/tag/` endpoint. `--auth` argument, but authentication is only applied to the `/tag/` endpoint.
* `--tagline=<str>` When specified, will replace the default tagline in the navbar.
* `--dev` Serve html & js files from disk (for development, used to modify frontend files without having to recompile)
### Web examples ### Web examples
@ -265,6 +327,12 @@ instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file. `descriptor.json` file.
### Link to specific indices
To link to specific indices, you can add a list of comma-separated index names to
the URL: `?i=<name>,<name>`. By default, indices with `"(nsfw)"` in their name are
not displayed.
## exec-script ## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state. The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.

View File

@ -78,7 +78,9 @@ application/vocaltec-media-desc, vmd
application/vocaltec-media-file, vmf application/vocaltec-media-file, vmf
application/warc, warc application/warc, warc
application/winhelp, hlp application/winhelp, hlp
application/wordperfect, wp|wp5|wp6|wpd|w60|w61 application/wordperfect6.0, w60
application/wordperfect6.1, w61
application/wordperfect, wp|wp5|wp6|wpd
application/x-123, wk1 application/x-123, wk1
application/x-7z-compressed, 7z application/x-7z-compressed, 7z
application/x-aim, aim application/x-aim, aim

1 application/arj arj
78 application/vocaltec-media-file vmf
79 application/warc warc
80 application/winhelp hlp
81 application/wordperfect application/wordperfect6.0 wp|wp5|wp6|wpd|w60|w61 w60
82 application/wordperfect6.1 w61
83 application/wordperfect wp|wp5|wp6|wpd
84 application/x-123 wk1
85 application/x-7z-compressed 7z
86 application/x-aim aim

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -8,7 +8,7 @@
</b-navbar-brand> </b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info"> <span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span> {{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
</span> </span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span> <span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>

View File

@ -14,7 +14,6 @@
#include "libscan/mobi/scan_mobi.h" #include "libscan/mobi/scan_mobi.h"
#include "libscan/raw/raw.h" #include "libscan/raw/raw.h"
#include "libscan/msdoc/msdoc.h" #include "libscan/msdoc/msdoc.h"
#include "libscan/wpd/wpd.h"
#include "src/io/store.h" #include "src/io/store.h"
#include <glib.h> #include <glib.h>
@ -61,7 +60,6 @@ typedef struct {
scan_mobi_ctx_t mobi_ctx; scan_mobi_ctx_t mobi_ctx;
scan_raw_ctx_t raw_ctx; scan_raw_ctx_t raw_ctx;
scan_msdoc_ctx_t msdoc_ctx; scan_msdoc_ctx_t msdoc_ctx;
scan_wpd_ctx_t wpd_ctx;
} ScanCtx_t; } ScanCtx_t;
typedef struct { typedef struct {

View File

@ -258,11 +258,6 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.raw_ctx.log = _log; ScanCtx.raw_ctx.log = _log;
ScanCtx.raw_ctx.logf = _logf; ScanCtx.raw_ctx.logf = _logf;
ScanCtx.raw_ctx.store = _store; ScanCtx.raw_ctx.store = _store;
ScanCtx.wpd_ctx.content_size = args->content_size;
ScanCtx.wpd_ctx.log = _log;
ScanCtx.wpd_ctx.logf = _logf;
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
} }

View File

@ -89,371 +89,373 @@ enum mime {
application_warc=655441, application_warc=655441,
application_winhelp=655442, application_winhelp=655442,
application_wordperfect=655443, application_wordperfect=655443,
application_x_123=655444, application_wordperfect6_0=655444,
application_x_7z_compressed=655445 | 0x10000000, application_wordperfect6_1=655445,
application_x_aim=655446, application_x_123=655446,
application_x_apple_diskimage=655447, application_x_7z_compressed=655447 | 0x10000000,
application_x_arc=655448 | 0x10000000, application_x_aim=655448,
application_x_archive=655449, application_x_apple_diskimage=655449,
application_x_atari_7800_rom=655450, application_x_arc=655450 | 0x10000000,
application_x_authorware_bin=655451, application_x_archive=655451,
application_x_authorware_map=655452, application_x_atari_7800_rom=655452,
application_x_authorware_seg=655453, application_x_authorware_bin=655453,
application_x_avira_qua=655454, application_x_authorware_map=655454,
application_x_bcpio=655455, application_x_authorware_seg=655455,
application_x_bittorrent=655456, application_x_avira_qua=655456,
application_x_bsh=655457, application_x_bcpio=655457,
application_x_bytecode_python=655458, application_x_bittorrent=655458,
application_x_bzip=655459, application_x_bsh=655459,
application_x_bzip2=655460 | 0x08000000, application_x_bytecode_python=655460,
application_x_cbr=655461, application_x_bzip=655461,
application_x_cbz=655462, application_x_bzip2=655462 | 0x08000000,
application_x_cdlink=655463, application_x_cbr=655463,
application_x_chat=655464, application_x_cbz=655464,
application_x_chrome_extension=655465, application_x_cdlink=655465,
application_x_cocoa=655466, application_x_chat=655466,
application_x_conference=655467, application_x_chrome_extension=655467,
application_x_coredump=655468, application_x_cocoa=655468,
application_x_cpio=655469, application_x_conference=655469,
application_x_dbf=655470, application_x_coredump=655470,
application_x_dbt=655471, application_x_cpio=655471,
application_x_debian_package=655472, application_x_dbf=655472,
application_x_deepv=655473, application_x_dbt=655473,
application_x_director=655474, application_x_debian_package=655474,
application_x_dmp=655475, application_x_deepv=655475,
application_x_dosdriver=655476, application_x_director=655476,
application_x_dosexec=655477, application_x_dmp=655477,
application_x_dvi=655478, application_x_dosdriver=655478,
application_x_elc=655479, application_x_dosexec=655479,
application_x_dvi=655480,
application_x_elc=655481,
application_x_empty=1, application_x_empty=1,
application_x_envoy=655480, application_x_envoy=655482,
application_x_esrehber=655481, application_x_esrehber=655483,
application_x_excel=655482, application_x_excel=655484,
application_x_executable=655483, application_x_executable=655485,
application_x_font_gdos=655484, application_x_font_gdos=655486,
application_x_font_pf2=655485, application_x_font_pf2=655487,
application_x_font_pfm=655486, application_x_font_pfm=655488,
application_x_font_sfn=655487, application_x_font_sfn=655489,
application_x_font_ttf=655488 | 0x20000000, application_x_font_ttf=655490 | 0x20000000,
application_x_fptapplication_x_dbt=655489, application_x_fptapplication_x_dbt=655491,
application_x_freelance=655490, application_x_freelance=655492,
application_x_gamecube_rom=655491, application_x_gamecube_rom=655493,
application_x_gdbm=655492, application_x_gdbm=655494,
application_x_gettext_translation=655493, application_x_gettext_translation=655495,
application_x_git=655494, application_x_git=655496,
application_x_gsp=655495, application_x_gsp=655497,
application_x_gss=655496, application_x_gss=655498,
application_x_gtar=655497, application_x_gtar=655499,
application_x_gzip=655498, application_x_gzip=655500,
application_x_hdf=655499, application_x_hdf=655501,
application_x_helpfile=655500, application_x_helpfile=655502,
application_x_httpd_imap=655501, application_x_httpd_imap=655503,
application_x_ima=655502, application_x_ima=655504,
application_x_innosetup=655503, application_x_innosetup=655505,
application_x_internett_signup=655504, application_x_internett_signup=655506,
application_x_inventor=655505, application_x_inventor=655507,
application_x_ip2=655506, application_x_ip2=655508,
application_x_java_applet=655507, application_x_java_applet=655509,
application_x_java_commerce=655508, application_x_java_commerce=655510,
application_x_java_image=655509, application_x_java_image=655511,
application_x_java_jmod=655510, application_x_java_jmod=655512,
application_x_java_keystore=655511, application_x_java_keystore=655513,
application_x_kdelnk=655512, application_x_kdelnk=655514,
application_x_koan=655513, application_x_koan=655515,
application_x_latex=655514, application_x_latex=655516,
application_x_livescreen=655515, application_x_livescreen=655517,
application_x_lotus=655516, application_x_lotus=655518,
application_x_lz4=655517 | 0x08000000, application_x_lz4=655519 | 0x08000000,
application_x_lz4_json=655518, application_x_lz4_json=655520,
application_x_lzh=655519, application_x_lzh=655521,
application_x_lzh_compressed=655520, application_x_lzh_compressed=655522,
application_x_lzip=655521 | 0x08000000, application_x_lzip=655523 | 0x08000000,
application_x_lzma=655522 | 0x08000000, application_x_lzma=655524 | 0x08000000,
application_x_lzop=655523 | 0x08000000, application_x_lzop=655525 | 0x08000000,
application_x_lzx=655524, application_x_lzx=655526,
application_x_mach_binary=655525, application_x_mach_binary=655527,
application_x_mach_executable=655526, application_x_mach_executable=655528,
application_x_magic_cap_package_1_0=655527, application_x_magic_cap_package_1_0=655529,
application_x_mathcad=655528, application_x_mathcad=655530,
application_x_maxis_dbpf=655529, application_x_maxis_dbpf=655531,
application_x_meme=655530, application_x_meme=655532,
application_x_midi=655531, application_x_midi=655533,
application_x_mif=655532, application_x_mif=655534,
application_x_mix_transfer=655533, application_x_mix_transfer=655535,
application_x_mobipocket_ebook=655534 | 0x02000000, application_x_mobipocket_ebook=655536 | 0x02000000,
application_x_ms_compress_szdd=655535, application_x_ms_compress_szdd=655537,
application_x_ms_pdb=655536, application_x_ms_pdb=655538,
application_x_ms_reader=655537, application_x_ms_reader=655539,
application_x_msaccess=655538, application_x_msaccess=655540,
application_x_n64_rom=655539, application_x_n64_rom=655541,
application_x_navi_animation=655540, application_x_navi_animation=655542,
application_x_navidoc=655541, application_x_navidoc=655543,
application_x_navimap=655542, application_x_navimap=655544,
application_x_navistyle=655543, application_x_navistyle=655545,
application_x_nes_rom=655544, application_x_nes_rom=655546,
application_x_netcdf=655545, application_x_netcdf=655547,
application_x_newton_compatible_pkg=655546, application_x_newton_compatible_pkg=655548,
application_x_nintendo_ds_rom=655547, application_x_nintendo_ds_rom=655549,
application_x_object=655548, application_x_object=655550,
application_x_omc=655549, application_x_omc=655551,
application_x_omcdatamaker=655550, application_x_omcdatamaker=655552,
application_x_omcregerator=655551, application_x_omcregerator=655553,
application_x_pagemaker=655552, application_x_pagemaker=655554,
application_x_pcl=655553, application_x_pcl=655555,
application_x_pgp_keyring=655554, application_x_pgp_keyring=655556,
application_x_pixclscript=655555, application_x_pixclscript=655557,
application_x_pkcs7_certreqresp=655556, application_x_pkcs7_certreqresp=655558,
application_x_pkcs7_signature=655557, application_x_pkcs7_signature=655559,
application_x_project=655558, application_x_project=655560,
application_x_qpro=655559, application_x_qpro=655561,
application_x_rar=655560 | 0x10000000, application_x_rar=655562 | 0x10000000,
application_x_rpm=655561, application_x_rpm=655563,
application_x_sdp=655562, application_x_sdp=655564,
application_x_sea=655563, application_x_sea=655565,
application_x_seelogo=655564, application_x_seelogo=655566,
application_x_setupscript=655565, application_x_setupscript=655567,
application_x_shar=655566, application_x_shar=655568,
application_x_sharedlib=655567, application_x_sharedlib=655569,
application_x_shockwave_flash=655568, application_x_shockwave_flash=655570,
application_x_snappy_framed=655569, application_x_snappy_framed=655571,
application_x_sprite=655570, application_x_sprite=655572,
application_x_sqlite3=655571, application_x_sqlite3=655573,
application_x_stargallery_thm=655572, application_x_stargallery_thm=655574,
application_x_stuffit=655573, application_x_stuffit=655575,
application_x_sv4cpio=655574, application_x_sv4cpio=655576,
application_x_sv4crc=655575, application_x_sv4crc=655577,
application_x_tar=655576 | 0x10000000, application_x_tar=655578 | 0x10000000,
application_x_tbook=655577, application_x_tbook=655579,
application_x_terminfo=655578, application_x_terminfo=655580,
application_x_terminfo2=655579, application_x_terminfo2=655581,
application_x_tex_tfm=655580, application_x_tex_tfm=655582,
application_x_texinfo=655581, application_x_texinfo=655583,
application_x_ustar=655582, application_x_ustar=655584,
application_x_visio=655583, application_x_visio=655585,
application_x_vnd_audioexplosion_mzz=655584, application_x_vnd_audioexplosion_mzz=655586,
application_x_vnd_ls_xpix=655585, application_x_vnd_ls_xpix=655587,
application_x_vrml=655586, application_x_vrml=655588,
application_x_wais_source=655587, application_x_wais_source=655589,
application_x_wine_extension_ini=655588, application_x_wine_extension_ini=655590,
application_x_wintalk=655589, application_x_wintalk=655591,
application_x_world=655590, application_x_world=655592,
application_x_wri=655591, application_x_wri=655593,
application_x_x509_ca_cert=655592, application_x_x509_ca_cert=655594,
application_x_xz=655593 | 0x08000000, application_x_xz=655595 | 0x08000000,
application_x_zip=655594, application_x_zip=655596,
application_x_zstd=655595 | 0x08000000, application_x_zstd=655597 | 0x08000000,
application_x_zstd_dictionary=655596, application_x_zstd_dictionary=655598,
application_xml=655597, application_xml=655599,
application_zip=655598 | 0x10000000, application_zip=655600 | 0x10000000,
application_zlib=655599, application_zlib=655601,
audio_basic=458992 | 0x80000000, audio_basic=458994 | 0x80000000,
audio_it=458993, audio_it=458995,
audio_make=458994, audio_make=458996,
audio_mid=458995, audio_mid=458997,
audio_midi=458996, audio_midi=458998,
audio_mp4=458997, audio_mp4=458999,
audio_mpeg=458998, audio_mpeg=459000,
audio_ogg=458999, audio_ogg=459001,
audio_s3m=459000, audio_s3m=459002,
audio_tsp_audio=459001, audio_tsp_audio=459003,
audio_tsplayer=459002, audio_tsplayer=459004,
audio_vnd_qcelp=459003, audio_vnd_qcelp=459005,
audio_voxware=459004, audio_voxware=459006,
audio_x_aiff=459005, audio_x_aiff=459007,
audio_x_flac=459006, audio_x_flac=459008,
audio_x_gsm=459007, audio_x_gsm=459009,
audio_x_hx_aac_adts=459008, audio_x_hx_aac_adts=459010,
audio_x_jam=459009, audio_x_jam=459011,
audio_x_liveaudio=459010, audio_x_liveaudio=459012,
audio_x_m4a=459011, audio_x_m4a=459013,
audio_x_midi=459012, audio_x_midi=459014,
audio_x_mod=459013, audio_x_mod=459015,
audio_x_mp4a_latm=459014, audio_x_mp4a_latm=459016,
audio_x_mpeg_3=459015, audio_x_mpeg_3=459017,
audio_x_mpequrl=459016, audio_x_mpequrl=459018,
audio_x_nspaudio=459017, audio_x_nspaudio=459019,
audio_x_pn_realaudio=459018, audio_x_pn_realaudio=459020,
audio_x_psid=459019, audio_x_psid=459021,
audio_x_realaudio=459020, audio_x_realaudio=459022,
audio_x_s3m=459021, audio_x_s3m=459023,
audio_x_twinvq=459022, audio_x_twinvq=459024,
audio_x_twinvq_plugin=459023, audio_x_twinvq_plugin=459025,
audio_x_voc=459024, audio_x_voc=459026,
audio_x_wav=459025, audio_x_wav=459027,
audio_x_xbox_executable=459026 | 0x80000000, audio_x_xbox_executable=459028 | 0x80000000,
audio_x_xbox360_executable=459027 | 0x80000000, audio_x_xbox360_executable=459029 | 0x80000000,
audio_xm=459028, audio_xm=459030,
font_otf=327957 | 0x20000000, font_otf=327959 | 0x20000000,
font_sfnt=327958 | 0x20000000, font_sfnt=327960 | 0x20000000,
font_woff=327959 | 0x20000000, font_woff=327961 | 0x20000000,
font_woff2=327960 | 0x20000000, font_woff2=327962 | 0x20000000,
image_bmp=524569, image_bmp=524571,
image_cmu_raster=524570, image_cmu_raster=524572,
image_fif=524571, image_fif=524573,
image_florian=524572, image_florian=524574,
image_g3fax=524573, image_g3fax=524575,
image_gif=524574, image_gif=524576,
image_heic=524575, image_heic=524577,
image_ief=524576, image_ief=524578,
image_jpeg=524577, image_jpeg=524579,
image_jutvision=524578, image_jutvision=524580,
image_naplps=524579, image_naplps=524581,
image_pict=524580, image_pict=524582,
image_png=524581, image_png=524583,
image_svg=524582 | 0x80000000, image_svg=524584 | 0x80000000,
image_svg_xml=524583 | 0x80000000, image_svg_xml=524585 | 0x80000000,
image_tiff=524584, image_tiff=524586,
image_vnd_adobe_photoshop=524585 | 0x80000000, image_vnd_adobe_photoshop=524587 | 0x80000000,
image_vnd_djvu=524586 | 0x80000000, image_vnd_djvu=524588 | 0x80000000,
image_vnd_fpx=524587, image_vnd_fpx=524589,
image_vnd_microsoft_icon=524588, image_vnd_microsoft_icon=524590,
image_vnd_rn_realflash=524589, image_vnd_rn_realflash=524591,
image_vnd_rn_realpix=524590, image_vnd_rn_realpix=524592,
image_vnd_wap_wbmp=524591, image_vnd_wap_wbmp=524593,
image_vnd_xiff=524592, image_vnd_xiff=524594,
image_webp=524593, image_webp=524595,
image_wmf=524594, image_wmf=524596,
image_x_3ds=524595, image_x_3ds=524597,
image_x_adobe_dng=524596 | 0x00800000, image_x_adobe_dng=524598 | 0x00800000,
image_x_award_bioslogo=524597, image_x_award_bioslogo=524599,
image_x_canon_cr2=524598 | 0x00800000, image_x_canon_cr2=524600 | 0x00800000,
image_x_canon_crw=524599 | 0x00800000, image_x_canon_crw=524601 | 0x00800000,
image_x_cmu_raster=524600, image_x_cmu_raster=524602,
image_x_cur=524601, image_x_cur=524603,
image_x_dcraw=524602 | 0x00800000, image_x_dcraw=524604 | 0x00800000,
image_x_dwg=524603, image_x_dwg=524605,
image_x_eps=524604, image_x_eps=524606,
image_x_epson_erf=524605 | 0x00800000, image_x_epson_erf=524607 | 0x00800000,
image_x_exr=524606, image_x_exr=524608,
image_x_fuji_raf=524607 | 0x00800000, image_x_fuji_raf=524609 | 0x00800000,
image_x_gem=524608, image_x_gem=524610,
image_x_icns=524609, image_x_icns=524611,
image_x_icon=524610 | 0x80000000, image_x_icon=524612 | 0x80000000,
image_x_jg=524611, image_x_jg=524613,
image_x_jps=524612, image_x_jps=524614,
image_x_kodak_dcr=524613 | 0x00800000, image_x_kodak_dcr=524615 | 0x00800000,
image_x_kodak_k25=524614 | 0x00800000, image_x_kodak_k25=524616 | 0x00800000,
image_x_kodak_kdc=524615 | 0x00800000, image_x_kodak_kdc=524617 | 0x00800000,
image_x_minolta_mrw=524616 | 0x00800000, image_x_minolta_mrw=524618 | 0x00800000,
image_x_ms_bmp=524617, image_x_ms_bmp=524619,
image_x_niff=524618, image_x_niff=524620,
image_x_nikon_nef=524619 | 0x00800000, image_x_nikon_nef=524621 | 0x00800000,
image_x_olympus_orf=524620 | 0x00800000, image_x_olympus_orf=524622 | 0x00800000,
image_x_panasonic_raw=524621 | 0x00800000, image_x_panasonic_raw=524623 | 0x00800000,
image_x_pcx=524622, image_x_pcx=524624,
image_x_pentax_pef=524623 | 0x00800000, image_x_pentax_pef=524625 | 0x00800000,
image_x_pict=524624, image_x_pict=524626,
image_x_portable_bitmap=524625, image_x_portable_bitmap=524627,
image_x_portable_graymap=524626, image_x_portable_graymap=524628,
image_x_portable_pixmap=524627, image_x_portable_pixmap=524629,
image_x_quicktime=524628, image_x_quicktime=524630,
image_x_rgb=524629, image_x_rgb=524631,
image_x_sigma_x3f=524630 | 0x00800000, image_x_sigma_x3f=524632 | 0x00800000,
image_x_sony_arw=524631 | 0x00800000, image_x_sony_arw=524633 | 0x00800000,
image_x_sony_sr2=524632 | 0x00800000, image_x_sony_sr2=524634 | 0x00800000,
image_x_sony_srf=524633 | 0x00800000, image_x_sony_srf=524635 | 0x00800000,
image_x_tga=524634, image_x_tga=524636,
image_x_tiff=524635, image_x_tiff=524637,
image_x_win_bitmap=524636, image_x_win_bitmap=524638,
image_x_xcf=524637 | 0x80000000, image_x_xcf=524639 | 0x80000000,
image_x_xpixmap=524638 | 0x80000000, image_x_xpixmap=524640 | 0x80000000,
image_x_xwindowdump=524639, image_x_xwindowdump=524641,
message_news=196960, message_news=196962,
message_rfc822=196961, message_rfc822=196963,
model_vnd_dwf=65890, model_vnd_dwf=65892,
model_vnd_gdl=65891, model_vnd_gdl=65893,
model_vnd_gs_gdl=65892, model_vnd_gs_gdl=65894,
model_vrml=65893, model_vrml=65895,
model_x_pov=65894, model_x_pov=65896,
sist2_sidecar=2, sist2_sidecar=2,
text_PGP=590183, text_PGP=590185,
text_asp=590184, text_asp=590186,
text_css=590185, text_css=590187,
text_html=590186 | 0x01000000, text_html=590188 | 0x01000000,
text_javascript=590187, text_javascript=590189,
text_mcf=590188, text_mcf=590190,
text_pascal=590189, text_pascal=590191,
text_plain=590190, text_plain=590192,
text_richtext=590191, text_richtext=590193,
text_rtf=590192, text_rtf=590194,
text_scriplet=590193, text_scriplet=590195,
text_tab_separated_values=590194, text_tab_separated_values=590196,
text_troff=590195, text_troff=590197,
text_uri_list=590196, text_uri_list=590198,
text_vnd_abc=590197, text_vnd_abc=590199,
text_vnd_fmi_flexstor=590198, text_vnd_fmi_flexstor=590200,
text_vnd_wap_wml=590199, text_vnd_wap_wml=590201,
text_vnd_wap_wmlscript=590200, text_vnd_wap_wmlscript=590202,
text_webviewhtml=590201, text_webviewhtml=590203,
text_x_Algol68=590202, text_x_Algol68=590204,
text_x_asm=590203, text_x_asm=590205,
text_x_audiosoft_intra=590204, text_x_audiosoft_intra=590206,
text_x_awk=590205, text_x_awk=590207,
text_x_bcpl=590206, text_x_bcpl=590208,
text_x_c=590207, text_x_c=590209,
text_x_c__=590208, text_x_c__=590210,
text_x_component=590209, text_x_component=590211,
text_x_diff=590210, text_x_diff=590212,
text_x_fortran=590211, text_x_fortran=590213,
text_x_java=590212, text_x_java=590214,
text_x_la_asf=590213, text_x_la_asf=590215,
text_x_lisp=590214, text_x_lisp=590216,
text_x_m=590215, text_x_m=590217,
text_x_m4=590216, text_x_m4=590218,
text_x_makefile=590217, text_x_makefile=590219,
text_x_ms_regedit=590218, text_x_ms_regedit=590220,
text_x_msdos_batch=590219, text_x_msdos_batch=590221,
text_x_objective_c=590220, text_x_objective_c=590222,
text_x_pascal=590221, text_x_pascal=590223,
text_x_perl=590222, text_x_perl=590224,
text_x_php=590223, text_x_php=590225,
text_x_po=590224, text_x_po=590226,
text_x_python=590225, text_x_python=590227,
text_x_ruby=590226, text_x_ruby=590228,
text_x_sass=590227, text_x_sass=590229,
text_x_scss=590228, text_x_scss=590230,
text_x_server_parsed_html=590229, text_x_server_parsed_html=590231,
text_x_setext=590230, text_x_setext=590232,
text_x_sgml=590231 | 0x01000000, text_x_sgml=590233 | 0x01000000,
text_x_shellscript=590232, text_x_shellscript=590234,
text_x_speech=590233, text_x_speech=590235,
text_x_tcl=590234, text_x_tcl=590236,
text_x_tex=590235, text_x_tex=590237,
text_x_uil=590236, text_x_uil=590238,
text_x_uuencode=590237, text_x_uuencode=590239,
text_x_vcalendar=590238, text_x_vcalendar=590240,
text_x_vcard=590239, text_x_vcard=590241,
text_xml=590240 | 0x01000000, text_xml=590242 | 0x01000000,
video_MP2T=393633, video_MP2T=393635,
video_animaflex=393634, video_animaflex=393636,
video_avi=393635, video_avi=393637,
video_avs_video=393636, video_avs_video=393638,
video_mp4=393637, video_mp4=393639,
video_mpeg=393638, video_mpeg=393640,
video_quicktime=393639, video_quicktime=393641,
video_vdo=393640, video_vdo=393642,
video_vivo=393641, video_vivo=393643,
video_vnd_rn_realvideo=393642, video_vnd_rn_realvideo=393644,
video_vosaic=393643, video_vosaic=393645,
video_webm=393644, video_webm=393646,
video_x_amt_demorun=393645, video_x_amt_demorun=393647,
video_x_amt_showrun=393646, video_x_amt_showrun=393648,
video_x_atomic3d_feature=393647, video_x_atomic3d_feature=393649,
video_x_dl=393648, video_x_dl=393650,
video_x_dv=393649, video_x_dv=393651,
video_x_fli=393650, video_x_fli=393652,
video_x_flv=393651, video_x_flv=393653,
video_x_isvideo=393652, video_x_isvideo=393654,
video_x_jng=393653 | 0x80000000, video_x_jng=393655 | 0x80000000,
video_x_m4v=393654, video_x_m4v=393656,
video_x_matroska=393655, video_x_matroska=393657,
video_x_mng=393656, video_x_mng=393658,
video_x_motion_jpeg=393657, video_x_motion_jpeg=393659,
video_x_ms_asf=393658, video_x_ms_asf=393660,
video_x_msvideo=393659, video_x_msvideo=393661,
video_x_qtc=393660, video_x_qtc=393662,
video_x_sgi_movie=393661, video_x_sgi_movie=393663,
x_epoc_x_sisx_app=721342, x_epoc_x_sisx_app=721344,
}; };
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) { char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj"; case application_arj: return "application/arj";
@ -535,6 +537,8 @@ case application_vocaltec_media_desc: return "application/vocaltec-media-desc";
case application_vocaltec_media_file: return "application/vocaltec-media-file"; case application_vocaltec_media_file: return "application/vocaltec-media-file";
case application_warc: return "application/warc"; case application_warc: return "application/warc";
case application_winhelp: return "application/winhelp"; case application_winhelp: return "application/winhelp";
case application_wordperfect6_0: return "application/wordperfect6.0";
case application_wordperfect6_1: return "application/wordperfect6.1";
case application_wordperfect: return "application/wordperfect"; case application_wordperfect: return "application/wordperfect";
case application_x_123: return "application/x-123"; case application_x_123: return "application/x-123";
case application_x_7z_compressed: return "application/x-7z-compressed"; case application_x_7z_compressed: return "application/x-7z-compressed";
@ -1004,12 +1008,12 @@ g_hash_table_insert(ext_table, "vmd", (gpointer)application_vocaltec_media_desc)
g_hash_table_insert(ext_table, "vmf", (gpointer)application_vocaltec_media_file); g_hash_table_insert(ext_table, "vmf", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(ext_table, "warc", (gpointer)application_warc); g_hash_table_insert(ext_table, "warc", (gpointer)application_warc);
g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp); g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect6_0);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect6_1);
g_hash_table_insert(ext_table, "wp", (gpointer)application_wordperfect); g_hash_table_insert(ext_table, "wp", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp5", (gpointer)application_wordperfect); g_hash_table_insert(ext_table, "wp5", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp6", (gpointer)application_wordperfect); g_hash_table_insert(ext_table, "wp6", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wpd", (gpointer)application_wordperfect); g_hash_table_insert(ext_table, "wpd", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wk1", (gpointer)application_x_123); g_hash_table_insert(ext_table, "wk1", (gpointer)application_x_123);
g_hash_table_insert(ext_table, "7z", (gpointer)application_x_7z_compressed); g_hash_table_insert(ext_table, "7z", (gpointer)application_x_7z_compressed);
g_hash_table_insert(ext_table, "aim", (gpointer)application_x_aim); g_hash_table_insert(ext_table, "aim", (gpointer)application_x_aim);
@ -1529,6 +1533,8 @@ g_hash_table_insert(mime_table, "application/vocaltec-media-desc", (gpointer)app
g_hash_table_insert(mime_table, "application/vocaltec-media-file", (gpointer)application_vocaltec_media_file); g_hash_table_insert(mime_table, "application/vocaltec-media-file", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(mime_table, "application/warc", (gpointer)application_warc); g_hash_table_insert(mime_table, "application/warc", (gpointer)application_warc);
g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp); g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp);
g_hash_table_insert(mime_table, "application/wordperfect6.0", (gpointer)application_wordperfect6_0);
g_hash_table_insert(mime_table, "application/wordperfect6.1", (gpointer)application_wordperfect6_1);
g_hash_table_insert(mime_table, "application/wordperfect", (gpointer)application_wordperfect); g_hash_table_insert(mime_table, "application/wordperfect", (gpointer)application_wordperfect);
g_hash_table_insert(mime_table, "application/x-123", (gpointer)application_x_123); g_hash_table_insert(mime_table, "application/x-123", (gpointer)application_x_123);
g_hash_table_insert(mime_table, "application/x-7z-compressed", (gpointer)application_x_7z_compressed); g_hash_table_insert(mime_table, "application/x-7z-compressed", (gpointer)application_x_7z_compressed);

View File

@ -9,8 +9,8 @@
#include <magic.h> #include <magic.h>
#define MIN_VIDEO_SIZE (1024 * 64) #define MIN_VIDEO_SIZE 1024 * 64
#define MIN_IMAGE_SIZE (1024 * 2) #define MIN_IMAGE_SIZE 1024 * 2
int fs_read(struct vfile *f, void *buf, size_t size) { int fs_read(struct vfile *f, void *buf, size_t size) {
@ -182,8 +182,6 @@ void parse(void *arg) {
return; return;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) { } else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc); parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
} else if (is_wpd(&ScanCtx.wpd_ctx, doc->mime)) {
parse_wpd(&ScanCtx.wpd_ctx, &job->vfile, doc);
} }
abort: abort:

View File

@ -49,7 +49,7 @@
#include <ctype.h> #include <ctype.h>
#include "git_hash.h" #include "git_hash.h"
#define VERSION "2.11.2" #define VERSION "2.11.1"
static const char *const Version = VERSION; static const char *const Version = VERSION;
#ifndef SIST_PLATFORM #ifndef SIST_PLATFORM

File diff suppressed because one or more lines are too long

2
third-party/libscan vendored

@ -1 +1 @@
Subproject commit fe53e1a219246d829439bb26093713a415a58924 Subproject commit 22522d7d4aeed0d16c6e44ebe38d70ba466357af