Compare commits

..

14 Commits

33 changed files with 631 additions and 573 deletions

View File

@@ -11,7 +11,7 @@ RUN ls -lh sist2-vue/dist/
FROM ubuntu:20.10
RUN apt update && apt install -y curl
RUN apt update && apt install -y curl libasan5
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \

View File

@@ -9,7 +9,7 @@ RUN strip sist2
FROM ubuntu:20.10
RUN apt update && apt install -y curl
RUN apt update && apt install -y curl libasan5
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \

View File

@@ -51,7 +51,7 @@ sist2 (Simple incremental search tool)
1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.0-x64-linux`
1. *(or)* `docker pull simon987/sist2:2.11.2-x64-linux`
1. See [Usage guide](docs/USAGE.md)
@@ -82,6 +82,7 @@ tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | *(none)* | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* |
\* *See [Archive files](#archive-files)*

View File

@@ -32,7 +32,7 @@ Lightning-fast file system indexer and search tool.
Scan options
-t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=5
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files.
@@ -41,12 +41,14 @@ Scan options
--name=<str> Index display name. DEFAULT: (name of the directory)
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files
--read-subtitles Read subtitles from media files.
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
Index options
-t, --threads=<int> Number of threads. DEFAULT=1
@@ -66,13 +68,14 @@ Web options
--bind=<str> Listen on this address. DEFAULT=localhost:4090
--auth=<str> Basic auth in user:password format
--tag-auth=<str> Basic auth in user:password format for tagging
--tagline=<str> Tagline in navbar
--dev Serve html & js files from disk (for development)
Exec-script options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
--script-file=<str> Path to user script.
--async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
```
## Scan
@@ -82,7 +85,7 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. *Does not affect PDF thumbnails quality*
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size`
Thumbnail size in pixels.
* `--content-size`
@@ -125,6 +128,7 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
### Scan examples
@@ -145,15 +149,11 @@ sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
### Index format
A typical `binary` type index structure looks like this:
A typical `ndjson` type index structure looks like this:
```
documents.idx/
├── descriptor.json
├── _index_139965416830720
├── _index_139965425223424
├── _index_139965433616128
├── _index_139965442008832
├── _index_139965442008832
├── _index_main.ndjson.zst
├── treemap.csv
├── agg_mime.csv
├── agg_date.csv
@@ -169,9 +169,7 @@ documents.idx/
└── lock.mdb
```
The `_index_*` files contain the raw binary index data and are not meant to be
read by other applications. The format is generally compatible across different
sist2 versions.
The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed newline-delemited file.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails.
@@ -181,66 +179,6 @@ following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rew
The `.csv` are pre-computed aggregations necessary for the stats page.
*Advanced usage*
Instead of using the `scan` module, you can also import an index generated
by a third party application. The 'external' index must have the following format:
```
my_index/
├── descriptor.json
├── _index_0
└── thumbs/
| ├── data.mdb
| └── lock.mdb
└── meta/
└── <empty>
```
*descriptor.json*:
```json
{
"uuid": "<valid UUID4>",
"version": "_external_v1",
"root": "(optional)",
"name": "<name>",
"rewrite_url": "(optional)",
"type": "json",
"timestamp": 1578971024
}
```
*_index_0*: NDJSON format (One json object per line)
```json
{
"_id": "unique uuid for the file",
"index": "index uuid4 (same one as descriptor.json!)",
"mime": "application/x-cbz",
"size": 14341204,
"mtime": 1578882996,
"extension": "cbz",
"name": "my_book",
"path": "path/to/books",
"content": "text contents of the book",
"title": "Title of the book",
"tag": ["genre.fiction", "author.someguy", "etc..."],
"_keyword": [
{"k": "ISBN", "v": "ABCD34789231"}
],
"_text": [
{"k": "other", "v": "This will be indexed as text"}
]
}
```
You can find the full list of supported fields [here](../src/io/serialize.c#L90)
The `_keyword.*` items will be indexed and searchable as **keyword** fields (only full matches allowed).
The `_text.*` items will be indexed and searchable as **text** fields (fuzzy searching allowed)
*thumbs/*:
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
@@ -248,9 +186,6 @@ and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guaranties of back/forward compatibility.
## Index
### Index options
@@ -276,6 +211,7 @@ it is currently unsupported and has no guaranties of back/forward compatibility.
down the process.
* `-f, --force-reset`
Reset Elasticsearch mappings and settings.
* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.
### Index examples
@@ -305,6 +241,8 @@ sist2 index --print ./my_index/ | jq | less
* `--auth=<str>` Basic auth in user:password format
* `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the
`--auth` argument, but authentication is only applied the `/tag/` endpoint.
* `--tagline=<str>` When specified, will replace the default tagline in the navbar.
* `--dev` Serve html & js files from disk (for development, used to modify frontend files without having to recompile)
### Web examples
@@ -327,12 +265,6 @@ instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
### Link to specific indices
To link to specific indices, you can add a list of comma-separated index name to
the URL: `?i=<name>,<name>`. By default, indices with `"(nsfw)"` in their name are
not displayed.
## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.

View File

@@ -4,6 +4,10 @@
"type": "keyword",
"doc_values": true
},
"checksum": {
"type": "keyword",
"index": false
},
"_depth": {
"type": "integer"
},

View File

@@ -2,7 +2,8 @@
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
"number_of_replicas": 0,
"highlight.max_analyzed_offset": 10000000
},
"analysis": {
"tokenizer": {

View File

@@ -22,6 +22,7 @@ application/java-archive, jar
application/java, class
application/javascript,
application/json, json
application/ndjson, jsonl|ndjson
application/marc, mrc
application/mbedlet, mbd
application/mime, aps
@@ -78,9 +79,7 @@ application/vocaltec-media-desc, vmd
application/vocaltec-media-file, vmf
application/warc, warc
application/winhelp, hlp
application/wordperfect6.0, w60
application/wordperfect6.1, w61
application/wordperfect, wp|wp5|wp6|wpd
application/wordperfect, wp|wp5|wp6|wpd|w60|w61
application/x-123, wk1
application/x-7z-compressed, 7z
application/x-aim, aim
1 application/arj arj
22 application/java class
23 application/javascript
24 application/json json
25 application/ndjson jsonl|ndjson
26 application/marc mrc
27 application/mbedlet mbd
28 application/mime aps
79 application/vocaltec-media-file vmf
80 application/warc warc
81 application/winhelp hlp
82 application/wordperfect6.0 application/wordperfect w60 wp|wp5|wp6|wpd|w60|w61
application/wordperfect6.1 w61
application/wordperfect wp|wp5|wp6|wpd
83 application/x-123 wk1
84 application/x-7z-compressed 7z
85 application/x-aim aim

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -50,6 +50,7 @@ export interface EsHit {
height: number
duration: number
tag: string[]
checksum: string
}
_props: {
isSubDocument: boolean

View File

@@ -187,7 +187,8 @@ class Sist2Query {
"name.nGram": {},
"content.nGram": {},
font_name: {},
}
},
max_analyzed_offset: 9_999_999
};
if (getters.optSearchInPath) {
q.highlight.fields["path.text"] = {};

View File

@@ -1,5 +1,5 @@
<template>
<b-list-group-item class="flex-column align-items-start mb-2">
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}">
<!-- Info modal-->
<DocInfoModal :show="showInfo" :doc="doc" @close="showInfo = false"></DocInfoModal>
@@ -40,9 +40,11 @@
</div>
<div v-if="doc._source.pages || doc._source.author" class="path-row text-muted">
<span v-if="doc._source.pages">{{ doc._source.pages }} {{ doc._source.pages > 1 ? $t("pages") : $t("page") }}</span>
<span v-if="doc._source.pages">{{ doc._source.pages }} {{
doc._source.pages > 1 ? $t("pages") : $t("page")
}}</span>
<span v-if="doc._source.author && doc._source.pages" class="mx-1">-</span>
<span v-if="doc._source.author">{{doc._source.author}}</span>
<span v-if="doc._source.author">{{ doc._source.author }}</span>
</div>
</div>
</div>
@@ -89,6 +91,14 @@ export default {
</script>
<style scoped>
.sub-document {
background: #AB47BC1F !important;
}
.theme-black .sub-document {
background: #37474F !important;
}
.list-group {
margin-top: 1em;
}

View File

@@ -3,7 +3,7 @@
<template #cell(value)="data">
<span v-if="'html' in data.item" v-html="data.item.html"></span>
<span v-else>{{data.value}}</span>
<span v-else>{{ data.value }}</span>
</template>
</b-table>
</template>
@@ -57,7 +57,8 @@ export default {
"bitrate", "artist", "album", "album_artist", "genre", "font_name", "author",
"modified_by", "pages", "tag",
"exif_make", "exif_software", "exif_exposure_time", "exif_fnumber", "exif_focal_length",
"exif_user_comment", "exif_iso_speed_ratings", "exif_model", "exif_datetime",
"exif_user_comment", "exif_iso_speed_ratings", "exif_model", "exif_datetime",
"checksum"
];
fields.forEach(field => {
@@ -76,9 +77,9 @@ export default {
items.push({
key: "Exif GPS",
html: makeGpsLink(
dmsToDecimal(src["exif_gps_latitude_dms"], src["exif_gps_latitude_ref"]),
dmsToDecimal(src["exif_gps_longitude_dms"], src["exif_gps_longitude_ref"]),
),
dmsToDecimal(src["exif_gps_latitude_dms"], src["exif_gps_latitude_ref"]),
dmsToDecimal(src["exif_gps_longitude_dms"], src["exif_gps_longitude_ref"]),
),
});
}

View File

@@ -8,7 +8,7 @@
</b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
</span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>

View File

@@ -62,7 +62,8 @@ export default {
lightboxLoadOnlyCurrent: "Do not preload full-size images for adjacent slides in image viewer.",
slideDuration: "Slide duration",
resultSize: "Number of results per page",
tagOrOperator: "Use OR operator when specifying multiple tags."
tagOrOperator: "Use OR operator when specifying multiple tags.",
hideDuplicates: "Hide duplicate results based on checksum"
},
queryMode: {
simple: "Simple",
@@ -209,7 +210,8 @@ export default {
lightboxLoadOnlyCurrent: "Désactiver le chargement des diapositives adjacentes pour le visualiseur d'images",
slideDuration: "Durée des diapositives",
resultSize: "Nombre de résultats par page",
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags"
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags",
hideDuplicates: "Masquer les résultats en double"
},
queryMode: {
simple: "Simple",

View File

@@ -27,6 +27,7 @@ export default new Vuex.Store({
size: 60,
optLang: "en",
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
@@ -79,6 +80,7 @@ export default new Vuex.Store({
setSizeMax: (state, val) => state.sizeMax = val,
setSist2Info: (state, val) => state.sist2Info = val,
setSeed: (state, val) => state.seed = val,
setOptHideDuplicates: (state, val) => state.optHideDuplicates = val,
setOptLang: (state, val) => state.optLang = val,
setSortMode: (state, val) => state.sortMode = val,
setIndices: (state, val) => {
@@ -317,6 +319,7 @@ export default new Vuex.Store({
uiLightboxKey: state => state.uiLightboxKey,
uiLightboxSlide: state => state.uiLightboxSlide,
optHideDuplicates: state => state.optHideDuplicates,
optLang: state => state.optLang,
optTheme: state => state.optTheme,
optDisplay: state => state.optDisplay,

View File

@@ -35,6 +35,11 @@
<br/>
<h4>{{ $t("searchOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optHideDuplicates" @input="setOptHideDuplicates">{{
$t("opt.hideDuplicates")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
$t("opt.tagOrOperator")
@@ -206,10 +211,10 @@ export default {
"optTreemapSize",
"optLightboxLoadOnlyCurrent",
"optLightboxSlideDuration",
"optContainerWidth",
"optResultSize",
"optTagOrOperator",
"optLang"
"optLang",
"optHideDuplicates",
]),
clientWidth() {
return window.innerWidth;
@@ -248,7 +253,8 @@ export default {
"setOptContainerWidth",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang"
"setOptLang",
"setOptHideDuplicates"
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");

View File

@@ -91,6 +91,7 @@ export default Vue.extend({
search: undefined as any,
docs: [] as EsHit[],
docIds: new Set(),
docChecksums: new Set(),
searchBusy: false,
Sist2Query: Sist2Query,
showHelp: false
@@ -193,6 +194,7 @@ export default Vue.extend({
async clearResults() {
this.docs = [];
this.docIds.clear();
this.docChecksums.clear();
await this.$store.dispatch("clearResults");
this.$store.commit("setUiReachedScrollEnd", false);
},
@@ -202,7 +204,19 @@ export default Vue.extend({
}
resp.hits.hits = resp.hits.hits.filter(hit => !this.docIds.has(hit._id));
resp.hits.hits.forEach(hit => this.docIds.add(hit._id));
if (this.$store.state.optHideDuplicates) {
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
this.docChecksums.add(hit._source.checksum);
return isDupe;
});
}
for (const hit of resp.hits.hits) {
if (hit._props.isPlayableImage || hit._props.isPlayableVideo) {

View File

@@ -28,6 +28,7 @@ typedef struct scan_args {
int max_memory_buffer;
int read_subtitles;
int fast_epub;
int calculate_checksums;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@@ -14,6 +14,8 @@
#include "libscan/mobi/scan_mobi.h"
#include "libscan/raw/raw.h"
#include "libscan/msdoc/msdoc.h"
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include <glib.h>
@@ -31,6 +33,7 @@ typedef struct {
int threads;
int depth;
int calculate_checksums;
size_t stat_tn_size;
size_t stat_index_size;
@@ -60,6 +63,8 @@ typedef struct {
scan_mobi_ctx_t mobi_ctx;
scan_raw_ctx_t raw_ctx;
scan_msdoc_ctx_t msdoc_ctx;
scan_wpd_ctx_t wpd_ctx;
scan_json_ctx_t json_ctx;
} ScanCtx_t;
typedef struct {

File diff suppressed because one or more lines are too long

View File

@@ -74,6 +74,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "exif_gps_latitude_dms";
case MetaExifGpsLatitudeDec:
return "exif_gps_latitude_dec";
case MetaChecksum:
return "checksum";
default:
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
}
@@ -165,6 +167,7 @@ char *build_json_string(document_t *doc) {
case MetaExifGpsLatitudeDMS:
case MetaExifGpsLatitudeDec:
case MetaExifGpsLatitudeRef:
case MetaChecksum:
case MetaTitle: {
cJSON_AddStringToObject(json, get_meta_key_text(meta->key), meta->str_val);
buffer_size_guess += (int) strlen(meta->str_val);

View File

@@ -4,6 +4,7 @@
store_t *store_create(const char *path, size_t chunk_size) {
store_t *store = malloc(sizeof(struct store_t));
mkdir(path, S_IWUSR | S_IRUSR | S_IXUSR);
strcpy(store->path, path);
#if (SIST_FAKE_STORE != 1)
store->chunk_size = chunk_size;
@@ -78,27 +79,57 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
int put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
ScanCtx.stat_tn_size += buf_len;
int db_full = FALSE;
int should_abort_transaction = FALSE;
if (put_ret == MDB_MAP_FULL) {
mdb_txn_abort(txn);
db_full = TRUE;
should_abort_transaction = TRUE;
} else {
int commit_ret = mdb_txn_commit(txn);
if (commit_ret == MDB_MAP_FULL) {
db_full = TRUE;
}
}
if (db_full) {
LOG_INFOF("store.c", "Updating mdb mapsize to %lu bytes", store->size)
if (should_abort_transaction) {
mdb_txn_abort(txn);
}
pthread_rwlock_unlock(&store->lock);
// Cannot resize when there is a opened transaction.
// Resize take effect on the next commit.
pthread_rwlock_wrlock(&store->lock);
store->size += store->chunk_size;
mdb_env_set_mapsize(store->env, store->size);
int resize_ret = mdb_env_set_mapsize(store->env, store->size);
if (resize_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
mdb_txn_begin(store->env, NULL, 0, &txn);
put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
int put_ret_retry = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
if (put_ret_retry != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
int ret = mdb_txn_commit(txn);
if (ret != 0) {
LOG_FATALF("store.c", "FIXME: Could not commit to store %s: %s (%d), %d, %d %d",
store->path, mdb_strerror(ret), ret,
put_ret, put_ret_retry);
}
LOG_INFOF("store.c", "Updated mdb mapsize to %lu bytes", store->size)
}
mdb_txn_commit(txn);
pthread_rwlock_unlock(&store->lock);
if (put_ret != 0) {
} else if (put_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
pthread_rwlock_unlock(&store->lock);
#endif
}

View File

@@ -6,12 +6,12 @@
#include <glib.h>
#define STORE_SIZE_TN 1024 * 1024 * 5
#define STORE_SIZE_TAG 1024 * 16
#define STORE_SIZE_TN (1024 * 1024 * 5)
#define STORE_SIZE_TAG (1024 * 1024)
#define STORE_SIZE_META STORE_SIZE_TAG
typedef struct store_t {
char *path;
char path[PATH_MAX];
char *tmp_path;
MDB_dbi dbi;
MDB_env *env;

View File

@@ -24,16 +24,22 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
job->vfile.filepath = job->filepath;
job->vfile.read = fs_read;
// Filesystem reads are always rewindable
job->vfile.read_rewindable = fs_read;
job->vfile.reset = fs_reset;
job->vfile.close = fs_close;
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
job->vfile.has_checksum = FALSE;
job->vfile.rewind_buffer_size = 0;
job->vfile.rewind_buffer = NULL;
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
return job;
}
int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, filepath, strlen(filepath), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {

View File

@@ -170,6 +170,8 @@ void initialize_scan_context(scan_args_t *args) {
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
ScanCtx.calculate_checksums = args->calculate_checksums;
// Archive
ScanCtx.arc_ctx.mode = args->archive_mode;
ScanCtx.arc_ctx.log = _log;
@@ -258,6 +260,19 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.raw_ctx.log = _log;
ScanCtx.raw_ctx.logf = _logf;
ScanCtx.raw_ctx.store = _store;
// Wpd
ScanCtx.wpd_ctx.content_size = args->content_size;
ScanCtx.wpd_ctx.log = _log;
ScanCtx.wpd_ctx.logf = _logf;
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
// Json
ScanCtx.json_ctx.content_size = args->content_size;
ScanCtx.json_ctx.log = _log;
ScanCtx.json_ctx.logf = _logf;
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/json");
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/ndjson");
}
@@ -503,8 +518,8 @@ void sist2_web(web_args_t *args) {
int main(int argc, const char *argv[]) {
sigsegv_handler = signal(SIGSEGV, sig_handler);
sigabrt_handler = signal(SIGABRT, sig_handler);
// sigsegv_handler = signal(SIGSEGV, sig_handler);
// sigabrt_handler = signal(SIGABRT, sig_handler);
setlocale(LC_ALL, "");
@@ -561,6 +576,7 @@ int main(int argc, const char *argv[]) {
OPT_BOOLEAN(0, "read-subtitles", &scan_args->read_subtitles, "Read subtitles from media files."),
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),

View File

@@ -35,427 +35,426 @@ enum mime {
application_mime=655387,
application_mspowerpoint=655388,
application_msword=655389,
application_netmc=655390,
application_octet_stream=655391,
application_oda=655392,
application_ogg=655393,
application_pdf=655394 | 0x40000000,
application_pgp_keys=655395,
application_pgp_signature=655396,
application_pkcs7_signature=655397,
application_pkix_cert=655398,
application_postscript=655399,
application_pro_eng=655400,
application_ringing_tones=655401,
application_smil=655402,
application_solids=655403,
application_sounder=655404,
application_step=655405,
application_streamingmedia=655406,
application_vda=655407,
application_vnd_amazon_mobi8_ebook=655408 | 0x02000000,
application_vnd_coffeescript=655409,
application_vnd_fdf=655410,
application_vnd_font_fontforge_sfd=655411,
application_vnd_hp_hpgl=655412,
application_vnd_iccprofile=655413,
application_vnd_lotus_1_2_3=655414,
application_vnd_ms_cab_compressed=655415,
application_vnd_ms_excel=655416,
application_vnd_ms_fontobject=655417,
application_vnd_ms_opentype=655418 | 0x20000000,
application_vnd_ms_outlook=655419,
application_vnd_ms_pki_certstore=655420,
application_vnd_ms_pki_pko=655421,
application_vnd_ms_pki_seccat=655422,
application_vnd_ms_powerpoint=655423,
application_vnd_ms_project=655424,
application_vnd_oasis_opendocument_base=655425,
application_vnd_oasis_opendocument_formula=655426,
application_vnd_oasis_opendocument_graphics=655427,
application_vnd_oasis_opendocument_presentation=655428,
application_vnd_oasis_opendocument_spreadsheet=655429,
application_vnd_oasis_opendocument_text=655430,
application_vnd_openxmlformats_officedocument_presentationml_presentation=655431 | 0x04000000,
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655432 | 0x04000000,
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655433 | 0x04000000,
application_vnd_symbian_install=655434,
application_vnd_tcpdump_pcap=655435,
application_vnd_wap_wmlc=655436,
application_vnd_wap_wmlscriptc=655437,
application_vnd_xara=655438,
application_vocaltec_media_desc=655439,
application_vocaltec_media_file=655440,
application_warc=655441,
application_winhelp=655442,
application_wordperfect=655443,
application_wordperfect6_0=655444,
application_wordperfect6_1=655445,
application_x_123=655446,
application_x_7z_compressed=655447 | 0x10000000,
application_x_aim=655448,
application_x_apple_diskimage=655449,
application_x_arc=655450 | 0x10000000,
application_x_archive=655451,
application_x_atari_7800_rom=655452,
application_x_authorware_bin=655453,
application_x_authorware_map=655454,
application_x_authorware_seg=655455,
application_x_avira_qua=655456,
application_x_bcpio=655457,
application_x_bittorrent=655458,
application_x_bsh=655459,
application_x_bytecode_python=655460,
application_x_bzip=655461,
application_x_bzip2=655462 | 0x08000000,
application_x_cbr=655463,
application_x_cbz=655464,
application_x_cdlink=655465,
application_x_chat=655466,
application_x_chrome_extension=655467,
application_x_cocoa=655468,
application_x_conference=655469,
application_x_coredump=655470,
application_x_cpio=655471,
application_x_dbf=655472,
application_x_dbt=655473,
application_x_debian_package=655474,
application_x_deepv=655475,
application_x_director=655476,
application_x_dmp=655477,
application_x_dosdriver=655478,
application_x_dosexec=655479,
application_x_dvi=655480,
application_x_elc=655481,
application_ndjson=655390,
application_netmc=655391,
application_octet_stream=655392,
application_oda=655393,
application_ogg=655394,
application_pdf=655395 | 0x40000000,
application_pgp_keys=655396,
application_pgp_signature=655397,
application_pkcs7_signature=655398,
application_pkix_cert=655399,
application_postscript=655400,
application_pro_eng=655401,
application_ringing_tones=655402,
application_smil=655403,
application_solids=655404,
application_sounder=655405,
application_step=655406,
application_streamingmedia=655407,
application_vda=655408,
application_vnd_amazon_mobi8_ebook=655409 | 0x02000000,
application_vnd_coffeescript=655410,
application_vnd_fdf=655411,
application_vnd_font_fontforge_sfd=655412,
application_vnd_hp_hpgl=655413,
application_vnd_iccprofile=655414,
application_vnd_lotus_1_2_3=655415,
application_vnd_ms_cab_compressed=655416,
application_vnd_ms_excel=655417,
application_vnd_ms_fontobject=655418,
application_vnd_ms_opentype=655419 | 0x20000000,
application_vnd_ms_outlook=655420,
application_vnd_ms_pki_certstore=655421,
application_vnd_ms_pki_pko=655422,
application_vnd_ms_pki_seccat=655423,
application_vnd_ms_powerpoint=655424,
application_vnd_ms_project=655425,
application_vnd_oasis_opendocument_base=655426,
application_vnd_oasis_opendocument_formula=655427,
application_vnd_oasis_opendocument_graphics=655428,
application_vnd_oasis_opendocument_presentation=655429,
application_vnd_oasis_opendocument_spreadsheet=655430,
application_vnd_oasis_opendocument_text=655431,
application_vnd_openxmlformats_officedocument_presentationml_presentation=655432 | 0x04000000,
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655433 | 0x04000000,
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655434 | 0x04000000,
application_vnd_symbian_install=655435,
application_vnd_tcpdump_pcap=655436,
application_vnd_wap_wmlc=655437,
application_vnd_wap_wmlscriptc=655438,
application_vnd_xara=655439,
application_vocaltec_media_desc=655440,
application_vocaltec_media_file=655441,
application_warc=655442,
application_winhelp=655443,
application_wordperfect=655444,
application_x_123=655445,
application_x_7z_compressed=655446 | 0x10000000,
application_x_aim=655447,
application_x_apple_diskimage=655448,
application_x_arc=655449 | 0x10000000,
application_x_archive=655450,
application_x_atari_7800_rom=655451,
application_x_authorware_bin=655452,
application_x_authorware_map=655453,
application_x_authorware_seg=655454,
application_x_avira_qua=655455,
application_x_bcpio=655456,
application_x_bittorrent=655457,
application_x_bsh=655458,
application_x_bytecode_python=655459,
application_x_bzip=655460,
application_x_bzip2=655461 | 0x08000000,
application_x_cbr=655462,
application_x_cbz=655463,
application_x_cdlink=655464,
application_x_chat=655465,
application_x_chrome_extension=655466,
application_x_cocoa=655467,
application_x_conference=655468,
application_x_coredump=655469,
application_x_cpio=655470,
application_x_dbf=655471,
application_x_dbt=655472,
application_x_debian_package=655473,
application_x_deepv=655474,
application_x_director=655475,
application_x_dmp=655476,
application_x_dosdriver=655477,
application_x_dosexec=655478,
application_x_dvi=655479,
application_x_elc=655480,
application_x_empty=1,
application_x_envoy=655482,
application_x_esrehber=655483,
application_x_excel=655484,
application_x_executable=655485,
application_x_font_gdos=655486,
application_x_font_pf2=655487,
application_x_font_pfm=655488,
application_x_font_sfn=655489,
application_x_font_ttf=655490 | 0x20000000,
application_x_fptapplication_x_dbt=655491,
application_x_freelance=655492,
application_x_gamecube_rom=655493,
application_x_gdbm=655494,
application_x_gettext_translation=655495,
application_x_git=655496,
application_x_gsp=655497,
application_x_gss=655498,
application_x_gtar=655499,
application_x_gzip=655500,
application_x_hdf=655501,
application_x_helpfile=655502,
application_x_httpd_imap=655503,
application_x_ima=655504,
application_x_innosetup=655505,
application_x_internett_signup=655506,
application_x_inventor=655507,
application_x_ip2=655508,
application_x_java_applet=655509,
application_x_java_commerce=655510,
application_x_java_image=655511,
application_x_java_jmod=655512,
application_x_java_keystore=655513,
application_x_kdelnk=655514,
application_x_koan=655515,
application_x_latex=655516,
application_x_livescreen=655517,
application_x_lotus=655518,
application_x_lz4=655519 | 0x08000000,
application_x_lz4_json=655520,
application_x_lzh=655521,
application_x_lzh_compressed=655522,
application_x_lzip=655523 | 0x08000000,
application_x_lzma=655524 | 0x08000000,
application_x_lzop=655525 | 0x08000000,
application_x_lzx=655526,
application_x_mach_binary=655527,
application_x_mach_executable=655528,
application_x_magic_cap_package_1_0=655529,
application_x_mathcad=655530,
application_x_maxis_dbpf=655531,
application_x_meme=655532,
application_x_midi=655533,
application_x_mif=655534,
application_x_mix_transfer=655535,
application_x_mobipocket_ebook=655536 | 0x02000000,
application_x_ms_compress_szdd=655537,
application_x_ms_pdb=655538,
application_x_ms_reader=655539,
application_x_msaccess=655540,
application_x_n64_rom=655541,
application_x_navi_animation=655542,
application_x_navidoc=655543,
application_x_navimap=655544,
application_x_navistyle=655545,
application_x_nes_rom=655546,
application_x_netcdf=655547,
application_x_newton_compatible_pkg=655548,
application_x_nintendo_ds_rom=655549,
application_x_object=655550,
application_x_omc=655551,
application_x_omcdatamaker=655552,
application_x_omcregerator=655553,
application_x_pagemaker=655554,
application_x_pcl=655555,
application_x_pgp_keyring=655556,
application_x_pixclscript=655557,
application_x_pkcs7_certreqresp=655558,
application_x_pkcs7_signature=655559,
application_x_project=655560,
application_x_qpro=655561,
application_x_rar=655562 | 0x10000000,
application_x_rpm=655563,
application_x_sdp=655564,
application_x_sea=655565,
application_x_seelogo=655566,
application_x_setupscript=655567,
application_x_shar=655568,
application_x_sharedlib=655569,
application_x_shockwave_flash=655570,
application_x_snappy_framed=655571,
application_x_sprite=655572,
application_x_sqlite3=655573,
application_x_stargallery_thm=655574,
application_x_stuffit=655575,
application_x_sv4cpio=655576,
application_x_sv4crc=655577,
application_x_tar=655578 | 0x10000000,
application_x_tbook=655579,
application_x_terminfo=655580,
application_x_terminfo2=655581,
application_x_tex_tfm=655582,
application_x_texinfo=655583,
application_x_ustar=655584,
application_x_visio=655585,
application_x_vnd_audioexplosion_mzz=655586,
application_x_vnd_ls_xpix=655587,
application_x_vrml=655588,
application_x_wais_source=655589,
application_x_wine_extension_ini=655590,
application_x_wintalk=655591,
application_x_world=655592,
application_x_wri=655593,
application_x_x509_ca_cert=655594,
application_x_xz=655595 | 0x08000000,
application_x_zip=655596,
application_x_zstd=655597 | 0x08000000,
application_x_zstd_dictionary=655598,
application_xml=655599,
application_zip=655600 | 0x10000000,
application_zlib=655601,
audio_basic=458994 | 0x80000000,
audio_it=458995,
audio_make=458996,
audio_mid=458997,
audio_midi=458998,
audio_mp4=458999,
audio_mpeg=459000,
audio_ogg=459001,
audio_s3m=459002,
audio_tsp_audio=459003,
audio_tsplayer=459004,
audio_vnd_qcelp=459005,
audio_voxware=459006,
audio_x_aiff=459007,
audio_x_flac=459008,
audio_x_gsm=459009,
audio_x_hx_aac_adts=459010,
audio_x_jam=459011,
audio_x_liveaudio=459012,
audio_x_m4a=459013,
audio_x_midi=459014,
audio_x_mod=459015,
audio_x_mp4a_latm=459016,
audio_x_mpeg_3=459017,
audio_x_mpequrl=459018,
audio_x_nspaudio=459019,
audio_x_pn_realaudio=459020,
audio_x_psid=459021,
audio_x_realaudio=459022,
audio_x_s3m=459023,
audio_x_twinvq=459024,
audio_x_twinvq_plugin=459025,
audio_x_voc=459026,
audio_x_wav=459027,
audio_x_xbox_executable=459028 | 0x80000000,
audio_x_xbox360_executable=459029 | 0x80000000,
audio_xm=459030,
font_otf=327959 | 0x20000000,
font_sfnt=327960 | 0x20000000,
font_woff=327961 | 0x20000000,
font_woff2=327962 | 0x20000000,
image_bmp=524571,
image_cmu_raster=524572,
image_fif=524573,
image_florian=524574,
image_g3fax=524575,
image_gif=524576,
image_heic=524577,
image_ief=524578,
image_jpeg=524579,
image_jutvision=524580,
image_naplps=524581,
image_pict=524582,
image_png=524583,
image_svg=524584 | 0x80000000,
image_svg_xml=524585 | 0x80000000,
image_tiff=524586,
image_vnd_adobe_photoshop=524587 | 0x80000000,
image_vnd_djvu=524588 | 0x80000000,
image_vnd_fpx=524589,
image_vnd_microsoft_icon=524590,
image_vnd_rn_realflash=524591,
image_vnd_rn_realpix=524592,
image_vnd_wap_wbmp=524593,
image_vnd_xiff=524594,
image_webp=524595,
image_wmf=524596,
image_x_3ds=524597,
image_x_adobe_dng=524598 | 0x00800000,
image_x_award_bioslogo=524599,
image_x_canon_cr2=524600 | 0x00800000,
image_x_canon_crw=524601 | 0x00800000,
image_x_cmu_raster=524602,
image_x_cur=524603,
image_x_dcraw=524604 | 0x00800000,
image_x_dwg=524605,
image_x_eps=524606,
image_x_epson_erf=524607 | 0x00800000,
image_x_exr=524608,
image_x_fuji_raf=524609 | 0x00800000,
image_x_gem=524610,
image_x_icns=524611,
image_x_icon=524612 | 0x80000000,
image_x_jg=524613,
image_x_jps=524614,
image_x_kodak_dcr=524615 | 0x00800000,
image_x_kodak_k25=524616 | 0x00800000,
image_x_kodak_kdc=524617 | 0x00800000,
image_x_minolta_mrw=524618 | 0x00800000,
image_x_ms_bmp=524619,
image_x_niff=524620,
image_x_nikon_nef=524621 | 0x00800000,
image_x_olympus_orf=524622 | 0x00800000,
image_x_panasonic_raw=524623 | 0x00800000,
image_x_pcx=524624,
image_x_pentax_pef=524625 | 0x00800000,
image_x_pict=524626,
image_x_portable_bitmap=524627,
image_x_portable_graymap=524628,
image_x_portable_pixmap=524629,
image_x_quicktime=524630,
image_x_rgb=524631,
image_x_sigma_x3f=524632 | 0x00800000,
image_x_sony_arw=524633 | 0x00800000,
image_x_sony_sr2=524634 | 0x00800000,
image_x_sony_srf=524635 | 0x00800000,
image_x_tga=524636,
image_x_tiff=524637,
image_x_win_bitmap=524638,
image_x_xcf=524639 | 0x80000000,
image_x_xpixmap=524640 | 0x80000000,
image_x_xwindowdump=524641,
message_news=196962,
message_rfc822=196963,
model_vnd_dwf=65892,
model_vnd_gdl=65893,
model_vnd_gs_gdl=65894,
model_vrml=65895,
model_x_pov=65896,
application_x_envoy=655481,
application_x_esrehber=655482,
application_x_excel=655483,
application_x_executable=655484,
application_x_font_gdos=655485,
application_x_font_pf2=655486,
application_x_font_pfm=655487,
application_x_font_sfn=655488,
application_x_font_ttf=655489 | 0x20000000,
application_x_fptapplication_x_dbt=655490,
application_x_freelance=655491,
application_x_gamecube_rom=655492,
application_x_gdbm=655493,
application_x_gettext_translation=655494,
application_x_git=655495,
application_x_gsp=655496,
application_x_gss=655497,
application_x_gtar=655498,
application_x_gzip=655499,
application_x_hdf=655500,
application_x_helpfile=655501,
application_x_httpd_imap=655502,
application_x_ima=655503,
application_x_innosetup=655504,
application_x_internett_signup=655505,
application_x_inventor=655506,
application_x_ip2=655507,
application_x_java_applet=655508,
application_x_java_commerce=655509,
application_x_java_image=655510,
application_x_java_jmod=655511,
application_x_java_keystore=655512,
application_x_kdelnk=655513,
application_x_koan=655514,
application_x_latex=655515,
application_x_livescreen=655516,
application_x_lotus=655517,
application_x_lz4=655518 | 0x08000000,
application_x_lz4_json=655519,
application_x_lzh=655520,
application_x_lzh_compressed=655521,
application_x_lzip=655522 | 0x08000000,
application_x_lzma=655523 | 0x08000000,
application_x_lzop=655524 | 0x08000000,
application_x_lzx=655525,
application_x_mach_binary=655526,
application_x_mach_executable=655527,
application_x_magic_cap_package_1_0=655528,
application_x_mathcad=655529,
application_x_maxis_dbpf=655530,
application_x_meme=655531,
application_x_midi=655532,
application_x_mif=655533,
application_x_mix_transfer=655534,
application_x_mobipocket_ebook=655535 | 0x02000000,
application_x_ms_compress_szdd=655536,
application_x_ms_pdb=655537,
application_x_ms_reader=655538,
application_x_msaccess=655539,
application_x_n64_rom=655540,
application_x_navi_animation=655541,
application_x_navidoc=655542,
application_x_navimap=655543,
application_x_navistyle=655544,
application_x_nes_rom=655545,
application_x_netcdf=655546,
application_x_newton_compatible_pkg=655547,
application_x_nintendo_ds_rom=655548,
application_x_object=655549,
application_x_omc=655550,
application_x_omcdatamaker=655551,
application_x_omcregerator=655552,
application_x_pagemaker=655553,
application_x_pcl=655554,
application_x_pgp_keyring=655555,
application_x_pixclscript=655556,
application_x_pkcs7_certreqresp=655557,
application_x_pkcs7_signature=655558,
application_x_project=655559,
application_x_qpro=655560,
application_x_rar=655561 | 0x10000000,
application_x_rpm=655562,
application_x_sdp=655563,
application_x_sea=655564,
application_x_seelogo=655565,
application_x_setupscript=655566,
application_x_shar=655567,
application_x_sharedlib=655568,
application_x_shockwave_flash=655569,
application_x_snappy_framed=655570,
application_x_sprite=655571,
application_x_sqlite3=655572,
application_x_stargallery_thm=655573,
application_x_stuffit=655574,
application_x_sv4cpio=655575,
application_x_sv4crc=655576,
application_x_tar=655577 | 0x10000000,
application_x_tbook=655578,
application_x_terminfo=655579,
application_x_terminfo2=655580,
application_x_tex_tfm=655581,
application_x_texinfo=655582,
application_x_ustar=655583,
application_x_visio=655584,
application_x_vnd_audioexplosion_mzz=655585,
application_x_vnd_ls_xpix=655586,
application_x_vrml=655587,
application_x_wais_source=655588,
application_x_wine_extension_ini=655589,
application_x_wintalk=655590,
application_x_world=655591,
application_x_wri=655592,
application_x_x509_ca_cert=655593,
application_x_xz=655594 | 0x08000000,
application_x_zip=655595,
application_x_zstd=655596 | 0x08000000,
application_x_zstd_dictionary=655597,
application_xml=655598,
application_zip=655599 | 0x10000000,
application_zlib=655600,
audio_basic=458993 | 0x80000000,
audio_it=458994,
audio_make=458995,
audio_mid=458996,
audio_midi=458997,
audio_mp4=458998,
audio_mpeg=458999,
audio_ogg=459000,
audio_s3m=459001,
audio_tsp_audio=459002,
audio_tsplayer=459003,
audio_vnd_qcelp=459004,
audio_voxware=459005,
audio_x_aiff=459006,
audio_x_flac=459007,
audio_x_gsm=459008,
audio_x_hx_aac_adts=459009,
audio_x_jam=459010,
audio_x_liveaudio=459011,
audio_x_m4a=459012,
audio_x_midi=459013,
audio_x_mod=459014,
audio_x_mp4a_latm=459015,
audio_x_mpeg_3=459016,
audio_x_mpequrl=459017,
audio_x_nspaudio=459018,
audio_x_pn_realaudio=459019,
audio_x_psid=459020,
audio_x_realaudio=459021,
audio_x_s3m=459022,
audio_x_twinvq=459023,
audio_x_twinvq_plugin=459024,
audio_x_voc=459025,
audio_x_wav=459026,
audio_x_xbox_executable=459027 | 0x80000000,
audio_x_xbox360_executable=459028 | 0x80000000,
audio_xm=459029,
font_otf=327958 | 0x20000000,
font_sfnt=327959 | 0x20000000,
font_woff=327960 | 0x20000000,
font_woff2=327961 | 0x20000000,
image_bmp=524570,
image_cmu_raster=524571,
image_fif=524572,
image_florian=524573,
image_g3fax=524574,
image_gif=524575,
image_heic=524576,
image_ief=524577,
image_jpeg=524578,
image_jutvision=524579,
image_naplps=524580,
image_pict=524581,
image_png=524582,
image_svg=524583 | 0x80000000,
image_svg_xml=524584 | 0x80000000,
image_tiff=524585,
image_vnd_adobe_photoshop=524586 | 0x80000000,
image_vnd_djvu=524587 | 0x80000000,
image_vnd_fpx=524588,
image_vnd_microsoft_icon=524589,
image_vnd_rn_realflash=524590,
image_vnd_rn_realpix=524591,
image_vnd_wap_wbmp=524592,
image_vnd_xiff=524593,
image_webp=524594,
image_wmf=524595,
image_x_3ds=524596,
image_x_adobe_dng=524597 | 0x00800000,
image_x_award_bioslogo=524598,
image_x_canon_cr2=524599 | 0x00800000,
image_x_canon_crw=524600 | 0x00800000,
image_x_cmu_raster=524601,
image_x_cur=524602,
image_x_dcraw=524603 | 0x00800000,
image_x_dwg=524604,
image_x_eps=524605,
image_x_epson_erf=524606 | 0x00800000,
image_x_exr=524607,
image_x_fuji_raf=524608 | 0x00800000,
image_x_gem=524609,
image_x_icns=524610,
image_x_icon=524611 | 0x80000000,
image_x_jg=524612,
image_x_jps=524613,
image_x_kodak_dcr=524614 | 0x00800000,
image_x_kodak_k25=524615 | 0x00800000,
image_x_kodak_kdc=524616 | 0x00800000,
image_x_minolta_mrw=524617 | 0x00800000,
image_x_ms_bmp=524618,
image_x_niff=524619,
image_x_nikon_nef=524620 | 0x00800000,
image_x_olympus_orf=524621 | 0x00800000,
image_x_panasonic_raw=524622 | 0x00800000,
image_x_pcx=524623,
image_x_pentax_pef=524624 | 0x00800000,
image_x_pict=524625,
image_x_portable_bitmap=524626,
image_x_portable_graymap=524627,
image_x_portable_pixmap=524628,
image_x_quicktime=524629,
image_x_rgb=524630,
image_x_sigma_x3f=524631 | 0x00800000,
image_x_sony_arw=524632 | 0x00800000,
image_x_sony_sr2=524633 | 0x00800000,
image_x_sony_srf=524634 | 0x00800000,
image_x_tga=524635,
image_x_tiff=524636,
image_x_win_bitmap=524637,
image_x_xcf=524638 | 0x80000000,
image_x_xpixmap=524639 | 0x80000000,
image_x_xwindowdump=524640,
message_news=196961,
message_rfc822=196962,
model_vnd_dwf=65891,
model_vnd_gdl=65892,
model_vnd_gs_gdl=65893,
model_vrml=65894,
model_x_pov=65895,
sist2_sidecar=2,
text_PGP=590185,
text_asp=590186,
text_css=590187,
text_html=590188 | 0x01000000,
text_javascript=590189,
text_mcf=590190,
text_pascal=590191,
text_plain=590192,
text_richtext=590193,
text_rtf=590194,
text_scriplet=590195,
text_tab_separated_values=590196,
text_troff=590197,
text_uri_list=590198,
text_vnd_abc=590199,
text_vnd_fmi_flexstor=590200,
text_vnd_wap_wml=590201,
text_vnd_wap_wmlscript=590202,
text_webviewhtml=590203,
text_x_Algol68=590204,
text_x_asm=590205,
text_x_audiosoft_intra=590206,
text_x_awk=590207,
text_x_bcpl=590208,
text_x_c=590209,
text_x_c__=590210,
text_x_component=590211,
text_x_diff=590212,
text_x_fortran=590213,
text_x_java=590214,
text_x_la_asf=590215,
text_x_lisp=590216,
text_x_m=590217,
text_x_m4=590218,
text_x_makefile=590219,
text_x_ms_regedit=590220,
text_x_msdos_batch=590221,
text_x_objective_c=590222,
text_x_pascal=590223,
text_x_perl=590224,
text_x_php=590225,
text_x_po=590226,
text_x_python=590227,
text_x_ruby=590228,
text_x_sass=590229,
text_x_scss=590230,
text_x_server_parsed_html=590231,
text_x_setext=590232,
text_x_sgml=590233 | 0x01000000,
text_x_shellscript=590234,
text_x_speech=590235,
text_x_tcl=590236,
text_x_tex=590237,
text_x_uil=590238,
text_x_uuencode=590239,
text_x_vcalendar=590240,
text_x_vcard=590241,
text_xml=590242 | 0x01000000,
video_MP2T=393635,
video_animaflex=393636,
video_avi=393637,
video_avs_video=393638,
video_mp4=393639,
video_mpeg=393640,
video_quicktime=393641,
video_vdo=393642,
video_vivo=393643,
video_vnd_rn_realvideo=393644,
video_vosaic=393645,
video_webm=393646,
video_x_amt_demorun=393647,
video_x_amt_showrun=393648,
video_x_atomic3d_feature=393649,
video_x_dl=393650,
video_x_dv=393651,
video_x_fli=393652,
video_x_flv=393653,
video_x_isvideo=393654,
video_x_jng=393655 | 0x80000000,
video_x_m4v=393656,
video_x_matroska=393657,
video_x_mng=393658,
video_x_motion_jpeg=393659,
video_x_ms_asf=393660,
video_x_msvideo=393661,
video_x_qtc=393662,
video_x_sgi_movie=393663,
x_epoc_x_sisx_app=721344,
text_PGP=590184,
text_asp=590185,
text_css=590186,
text_html=590187 | 0x01000000,
text_javascript=590188,
text_mcf=590189,
text_pascal=590190,
text_plain=590191,
text_richtext=590192,
text_rtf=590193,
text_scriplet=590194,
text_tab_separated_values=590195,
text_troff=590196,
text_uri_list=590197,
text_vnd_abc=590198,
text_vnd_fmi_flexstor=590199,
text_vnd_wap_wml=590200,
text_vnd_wap_wmlscript=590201,
text_webviewhtml=590202,
text_x_Algol68=590203,
text_x_asm=590204,
text_x_audiosoft_intra=590205,
text_x_awk=590206,
text_x_bcpl=590207,
text_x_c=590208,
text_x_c__=590209,
text_x_component=590210,
text_x_diff=590211,
text_x_fortran=590212,
text_x_java=590213,
text_x_la_asf=590214,
text_x_lisp=590215,
text_x_m=590216,
text_x_m4=590217,
text_x_makefile=590218,
text_x_ms_regedit=590219,
text_x_msdos_batch=590220,
text_x_objective_c=590221,
text_x_pascal=590222,
text_x_perl=590223,
text_x_php=590224,
text_x_po=590225,
text_x_python=590226,
text_x_ruby=590227,
text_x_sass=590228,
text_x_scss=590229,
text_x_server_parsed_html=590230,
text_x_setext=590231,
text_x_sgml=590232 | 0x01000000,
text_x_shellscript=590233,
text_x_speech=590234,
text_x_tcl=590235,
text_x_tex=590236,
text_x_uil=590237,
text_x_uuencode=590238,
text_x_vcalendar=590239,
text_x_vcard=590240,
text_xml=590241 | 0x01000000,
video_MP2T=393634,
video_animaflex=393635,
video_avi=393636,
video_avs_video=393637,
video_mp4=393638,
video_mpeg=393639,
video_quicktime=393640,
video_vdo=393641,
video_vivo=393642,
video_vnd_rn_realvideo=393643,
video_vosaic=393644,
video_webm=393645,
video_x_amt_demorun=393646,
video_x_amt_showrun=393647,
video_x_atomic3d_feature=393648,
video_x_dl=393649,
video_x_dv=393650,
video_x_fli=393651,
video_x_flv=393652,
video_x_isvideo=393653,
video_x_jng=393654 | 0x80000000,
video_x_m4v=393655,
video_x_matroska=393656,
video_x_mng=393657,
video_x_motion_jpeg=393658,
video_x_ms_asf=393659,
video_x_msvideo=393660,
video_x_qtc=393661,
video_x_sgi_movie=393662,
x_epoc_x_sisx_app=721343,
};
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj";
@@ -482,6 +481,7 @@ case application_java_archive: return "application/java-archive";
case application_java: return "application/java";
case application_javascript: return "application/javascript";
case application_json: return "application/json";
case application_ndjson: return "application/ndjson";
case application_marc: return "application/marc";
case application_mbedlet: return "application/mbedlet";
case application_mime: return "application/mime";
@@ -537,8 +537,6 @@ case application_vocaltec_media_desc: return "application/vocaltec-media-desc";
case application_vocaltec_media_file: return "application/vocaltec-media-file";
case application_warc: return "application/warc";
case application_winhelp: return "application/winhelp";
case application_wordperfect6_0: return "application/wordperfect6.0";
case application_wordperfect6_1: return "application/wordperfect6.1";
case application_wordperfect: return "application/wordperfect";
case application_x_123: return "application/x-123";
case application_x_7z_compressed: return "application/x-7z-compressed";
@@ -934,6 +932,8 @@ g_hash_table_insert(ext_table, "inf", (gpointer)application_inf);
g_hash_table_insert(ext_table, "jar", (gpointer)application_java_archive);
g_hash_table_insert(ext_table, "class", (gpointer)application_java);
g_hash_table_insert(ext_table, "json", (gpointer)application_json);
g_hash_table_insert(ext_table, "jsonl", (gpointer)application_ndjson);
g_hash_table_insert(ext_table, "ndjson", (gpointer)application_ndjson);
g_hash_table_insert(ext_table, "mrc", (gpointer)application_marc);
g_hash_table_insert(ext_table, "mbd", (gpointer)application_mbedlet);
g_hash_table_insert(ext_table, "aps", (gpointer)application_mime);
@@ -1008,12 +1008,12 @@ g_hash_table_insert(ext_table, "vmd", (gpointer)application_vocaltec_media_desc)
g_hash_table_insert(ext_table, "vmf", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(ext_table, "warc", (gpointer)application_warc);
g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect6_0);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect6_1);
g_hash_table_insert(ext_table, "wp", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp5", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp6", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wpd", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wk1", (gpointer)application_x_123);
g_hash_table_insert(ext_table, "7z", (gpointer)application_x_7z_compressed);
g_hash_table_insert(ext_table, "aim", (gpointer)application_x_aim);
@@ -1478,6 +1478,7 @@ g_hash_table_insert(mime_table, "application/java-archive", (gpointer)applicatio
g_hash_table_insert(mime_table, "application/java", (gpointer)application_java);
g_hash_table_insert(mime_table, "application/javascript", (gpointer)application_javascript);
g_hash_table_insert(mime_table, "application/json", (gpointer)application_json);
g_hash_table_insert(mime_table, "application/ndjson", (gpointer)application_ndjson);
g_hash_table_insert(mime_table, "application/marc", (gpointer)application_marc);
g_hash_table_insert(mime_table, "application/mbedlet", (gpointer)application_mbedlet);
g_hash_table_insert(mime_table, "application/mime", (gpointer)application_mime);
@@ -1533,8 +1534,6 @@ g_hash_table_insert(mime_table, "application/vocaltec-media-desc", (gpointer)app
g_hash_table_insert(mime_table, "application/vocaltec-media-file", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(mime_table, "application/warc", (gpointer)application_warc);
g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp);
g_hash_table_insert(mime_table, "application/wordperfect6.0", (gpointer)application_wordperfect6_0);
g_hash_table_insert(mime_table, "application/wordperfect6.1", (gpointer)application_wordperfect6_1);
g_hash_table_insert(mime_table, "application/wordperfect", (gpointer)application_wordperfect);
g_hash_table_insert(mime_table, "application/x-123", (gpointer)application_x_123);
g_hash_table_insert(mime_table, "application/x-7z-compressed", (gpointer)application_x_7z_compressed);

View File

@@ -9,26 +9,35 @@
#include <magic.h>
#define MIN_VIDEO_SIZE 1024 * 64
#define MIN_IMAGE_SIZE 1024 * 2
#define MIN_VIDEO_SIZE (1024 * 64)
#define MIN_IMAGE_SIZE (512)
int fs_read(struct vfile *f, void *buf, size_t size) {
if (f->fd == -1) {
SHA1_Init(&f->sha1_ctx);
f->fd = open(f->filepath, O_RDONLY);
if (f->fd == -1) {
LOG_ERRORF(f->filepath, "open(): [%d] %s", errno, strerror(errno))
return -1;
}
}
return read(f->fd, buf, size);
int ret = (int) read(f->fd, buf, size);
if (ret != 0 && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
}
return ret;
}
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
void fs_close(struct vfile *f) {
if (f->fd != -1) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
close(f->fd);
}
}
@@ -66,7 +75,7 @@ void parse(void *arg) {
doc->meta_tail = NULL;
doc->mime = 0;
doc->size = job->vfile.info.st_size;
doc->mtime = job->vfile.info.st_mtim.tv_sec;
doc->mtime = (int) job->vfile.info.st_mtim.tv_sec;
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
@@ -93,18 +102,17 @@ void parse(void *arg) {
doc->mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
}
int bytes_read = 0;
if (doc->mime == 0 && !ScanCtx.fast) {
// Get mime type with libmagic
if (!job->vfile.is_fs_file) {
if (job->vfile.read_rewindable == NULL) {
LOG_WARNING(job->filepath,
"Guessing mime type with libmagic inside archive files is not currently supported");
"File does not support rewindable reads, cannot guess Media type");
goto abort;
}
bytes_read = job->vfile.read(&job->vfile, buf, MAGIC_BUF_SIZE);
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
if (bytes_read < 0) {
if (job->vfile.is_fs_file) {
@@ -135,7 +143,9 @@ void parse(void *arg) {
}
}
job->vfile.reset(&job->vfile);
if (job->vfile.reset != NULL) {
job->vfile.reset(&job->vfile);
}
magic_close(magic);
}
@@ -149,7 +159,7 @@ void parse(void *arg) {
} else if ((mmime == MimeVideo && doc->size >= MIN_VIDEO_SIZE) ||
(mmime == MimeImage && doc->size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
parse_media(&ScanCtx.media_ctx, &job->vfile, doc);
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
} else if (IS_PDF(doc->mime)) {
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
@@ -169,7 +179,7 @@ void parse(void *arg) {
IS_ARC(doc->mime) ||
(IS_ARC_FILTER(doc->mime) && should_parse_filtered_file(doc->filepath, doc->ext))
)) {
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc);
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(doc->mime)) {
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
} else if (is_cbr(&ScanCtx.comic_ctx, doc->mime) || is_cbz(&ScanCtx.comic_ctx, doc->mime)) {
@@ -179,9 +189,15 @@ void parse(void *arg) {
} else if (doc->mime == MIME_SIST2_SIDECAR) {
parse_sidecar(&job->vfile, doc);
CLOSE_FILE(job->vfile)
free(doc->filepath);
free(doc);
return;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
} else if (is_json(&ScanCtx.json_ctx, doc->mime)) {
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
} else if (is_ndjson(&ScanCtx.json_ctx, doc->mime)) {
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
}
abort:
@@ -198,9 +214,15 @@ void parse(void *arg) {
doc->has_parent = FALSE;
}
write_document(doc);
CLOSE_FILE(job->vfile)
if (job->vfile.has_checksum) {
char sha1_digest_str[SHA1_STR_LENGTH];
buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str);
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str);
}
write_document(doc);
}
void cleanup_parse() {

View File

@@ -3,7 +3,7 @@
#include "../sist.h"
#define MAGIC_BUF_SIZE 4096 * 6
#define MAGIC_BUF_SIZE (4096 * 6)
int fs_read(struct vfile *f, void *buf, size_t size);
void fs_close(struct vfile *f);

View File

@@ -27,7 +27,10 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
MD5((unsigned char *) vfile->filepath + ScanCtx.index.desc.root_len, doc->ext - 1 - ScanCtx.index.desc.root_len,
path_md5);
store_write(ScanCtx.index.meta_store, (char *) path_md5, sizeof(path_md5), json_str, strlen(json_str) + 1);
char path_md5_str[MD5_STR_LENGTH];
buf2hex(path_md5, MD5_DIGEST_LENGTH, path_md5_str);
store_write(ScanCtx.index.meta_store, path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
cJSON_Delete(json);
free(json_str);

View File

@@ -26,6 +26,8 @@
#define UNUSED(x) __attribute__((__unused__)) x
#define MD5_STR_LENGTH 33
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#include "util.h"
#include "log.h"
@@ -49,7 +51,7 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "2.11.1"
#define VERSION "2.11.2"
static const char *const Version = VERSION;
#ifndef SIST_PLATFORM

File diff suppressed because one or more lines are too long