Compare commits

...

10 Commits

34 changed files with 312 additions and 117 deletions

4
.gitignore vendored
View File

@ -23,4 +23,6 @@ git_hash.h
Testing/
test_i
test_i_inc
node_modules/
node_modules/
.cmake/
i_inc/

View File

@ -67,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
## Format support
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - |
html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* |
json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - |
| File type | Library | Content | Thumbnail | Metadata |
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
| `audio/*` | ffmpeg | - | yes | ID3 tags |
| `video/*` | ffmpeg | - | yes | title, comment, artist |
| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
| mobi, azw, azw3 | libmobi | yes | no | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)*
@ -102,18 +102,24 @@ scan is also supported.
### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with the
`--ocr-lang <lang>` option in combination with `--ocr-images` and/or `--ocr-ebooks`.
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from GitHub](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
Examples
You can use the `+` separator to specify multiple languages. The language
name must be identical to the `*.traineddata` file installed on your system
(use `chi_sim` rather than `chi-sim`).
Examples:
```bash
sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/
sist2 scan --ocr-ebooks --ocr-lang jpn ~/Books/Manga/
sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
```
## Build from source
@ -126,7 +132,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
git clone --recursive https://github.com/simon987/sist2/
cd sist2
docker build . -f ./Dockerfile -t my-sist2-image
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
```
### On a linux computer

View File

@ -43,7 +43,7 @@ Scan options
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
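--ocr-lang=<str> Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
--ocr-images Enable OCR'ing of image files.
--ocr-ebooks Enable OCR'ing of ebook files.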
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005

View File

@ -6,5 +6,4 @@ python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
printf "static const char *const LibScanCommitHash = \"%s\";\n" $(cd third-party/libscan/ && git rev-parse HEAD) >> src/git_hash.h
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -15,7 +15,7 @@
<script>
import IndexDebugInfo from "@/components/IndexDebugInfo";
import DebugIcon from "@/components/DebugIcon";
import DebugIcon from "@/components/icons/DebugIcon";
export default {
name: "DebugInfo.vue",
@ -27,7 +27,6 @@ export default {
{key: "platform", value: this.$store.state.sist2Info.platform},
{key: "debugBinary", value: this.$store.state.sist2Info.debug},
{key: "sist2CommitHash", value: this.$store.state.sist2Info.sist2Hash},
{key: "libscanCommitHash", value: this.$store.state.sist2Info.libscanHash},
{key: "esIndex", value: this.$store.state.sist2Info.esIndex},
{key: "tagline", value: this.$store.state.sist2Info.tagline},
{key: "dev", value: this.$store.state.sist2Info.dev},

View File

@ -34,9 +34,11 @@
</svg>
</div>
<img v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
<img ref="tn"
v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
:src="(doc._props.isGif && hover) ? `f/${doc._id}` : `t/${doc._source.index}/${doc._id}`"
alt=""
:style="{height: (doc._props.isGif && hover) ? `${tnHeight()}px` : undefined}"
class="pointer fit card-img-top" @click="onThumbnailClick()">
<img v-else :src="`t/${doc._source.index}/${doc._id}`" alt=""
class="fit card-img-top">
@ -122,6 +124,9 @@ export default {
},
onTnLeave() {
this.hover = false;
},
tnHeight() {
return this.$refs.tn.height;
}
},
}

View File

@ -1,5 +1,6 @@
<template>
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}">
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}"
@mouseenter="onTnEnter()" @mouseleave="onTnLeave()" >
<!-- Info modal-->
<DocInfoModal :show="showInfo" :doc="doc" @close="showInfo = false"></DocInfoModal>
@ -56,7 +57,7 @@ import TagContainer from "@/components/TagContainer";
import DocFileTitle from "@/components/DocFileTitle";
import DocInfoModal from "@/components/DocInfoModal";
import ContentDiv from "@/components/ContentDiv";
import FileIcon from "@/components/FileIcon";
import FileIcon from "@/components/icons/FileIcon";
export default {
name: "DocListItem",
@ -85,7 +86,13 @@ export default {
return this.doc.highlight["path.nGram"] + "/"
}
return this.doc._source.path + "/"
}
},
onTnEnter() {
this.hover = true;
},
onTnLeave() {
this.hover = false;
},
}
}
</script>

View File

@ -1,6 +1,5 @@
<template>
<div>
<!-- TODO: Set slideshowTime as a configurable option-->
<FsLightbox
:key="lightboxKey"
:toggler="showLightbox"
@ -10,7 +9,7 @@
:types="lightboxTypes"
:source-index="lightboxSlide"
:custom-toolbar-buttons="customButtons"
:slideshow-time="1000 * 10"
:slideshow-time="$store.getters.optLightboxSlideDuration * 1000"
:zoom-increment="0.5"
:load-only-current-source="$store.getters.optLightboxLoadOnlyCurrent"
:on-close="onClose"

View File

@ -20,7 +20,7 @@
</template>
<script>
import Sist2Icon from "@/components/Sist2Icon";
import Sist2Icon from "@/components/icons/Sist2Icon";
export default {
name: "NavBar",

View File

@ -51,7 +51,7 @@
>{{ tag.text.split(".").pop() }}</span>
<b-popover :target="hit._id+tag.rawText" triggers="focus blur" placement="top">
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">Delete</b-button>
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{$t("deleteTag")}}</b-button>
</b-popover>
</div>
@ -63,7 +63,7 @@
</template>
<!-- Add button -->
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">Add</small>
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">{{$t("addTag")}}</small>
<!-- Size tag-->
<small v-else class="text-muted badge-size">{{

View File

@ -0,0 +1,21 @@
<template>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm1 16.057v-3.057h2.994c-.059 1.143-.212 2.24-.456 3.279-.823-.12-1.674-.188-2.538-.222zm1.957 2.162c-.499 1.33-1.159 2.497-1.957 3.456v-3.62c.666.028 1.319.081 1.957.164zm-1.957-7.219v-3.015c.868-.034 1.721-.103 2.548-.224.238 1.027.389 2.111.446 3.239h-2.994zm0-5.014v-3.661c.806.969 1.471 2.15 1.971 3.496-.642.084-1.3.137-1.971.165zm2.703-3.267c1.237.496 2.354 1.228 3.29 2.146-.642.234-1.311.442-2.019.607-.344-.992-.775-1.91-1.271-2.753zm-7.241 13.56c-.244-1.039-.398-2.136-.456-3.279h2.994v3.057c-.865.034-1.714.102-2.538.222zm2.538 1.776v3.62c-.798-.959-1.458-2.126-1.957-3.456.638-.083 1.291-.136 1.957-.164zm-2.994-7.055c.057-1.128.207-2.212.446-3.239.827.121 1.68.19 2.548.224v3.015h-2.994zm1.024-5.179c.5-1.346 1.165-2.527 1.97-3.496v3.661c-.671-.028-1.329-.081-1.97-.165zm-2.005-.35c-.708-.165-1.377-.373-2.018-.607.937-.918 2.053-1.65 3.29-2.146-.496.844-.927 1.762-1.272 2.753zm-.549 1.918c-.264 1.151-.434 2.36-.492 3.611h-3.933c.165-1.658.739-3.197 1.617-4.518.88.361 1.816.67 2.808.907zm.009 9.262c-.988.236-1.92.542-2.797.9-.89-1.328-1.471-2.879-1.637-4.551h3.934c.058 1.265.231 2.488.5 3.651zm.553 1.917c.342.976.768 1.881 1.257 2.712-1.223-.49-2.326-1.211-3.256-2.115.636-.229 1.299-.435 1.999-.597zm9.924 0c.7.163 1.362.367 1.999.597-.931.903-2.034 1.625-3.257 2.116.489-.832.915-1.737 1.258-2.713zm.553-1.917c.27-1.163.442-2.386.501-3.651h3.934c-.167 1.672-.748 3.223-1.638 4.551-.877-.358-1.81-.664-2.797-.9zm.501-5.651c-.058-1.251-.229-2.46-.492-3.611.992-.237 1.929-.546 2.809-.907.877 1.321 1.451 2.86 1.616 4.518h-3.933z"/>
</svg>
</template>
<script>
export default {
name: "LanguageIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@ -5,6 +5,8 @@ export default {
advanced: "Advanced search",
fuzzy: "Fuzzy"
},
addTag: "Add",
deleteTag: "Delete",
download: "Download",
and: "and",
page: "page",
@ -132,6 +134,7 @@ export default {
saveTagModalTitle: "Add tag",
saveTagPlaceholder: "Tag name",
confirm: "Confirm",
indexPickerPlaceholder: "Select an index",
sort: {
relevance: "Relevance",
dateAsc: "Date (Older first)",
@ -161,6 +164,8 @@ export default {
advanced: "Recherche avancée",
fuzzy: "Approximatif"
},
addTag: "Ajouter",
deleteTag: "Supprimer",
download: "Télécharger",
and: "et",
page: "page",
@ -320,6 +325,8 @@ export default {
advanced: "高级搜索",
fuzzy: "模糊搜索"
},
addTag: "添加",
deleteTag: "删除",
download: "下载",
and: "与",
page: "页",
@ -447,6 +454,7 @@ export default {
saveTagModalTitle: "增加标签",
saveTagPlaceholder: "标签名",
confirm: "确认",
indexPickerPlaceholder: "选择一个索引",
sort: {
relevance: "相关度",
dateAsc: "日期(由旧到新)",

View File

@ -27,6 +27,7 @@ export default new Vuex.Store({
size: 60,
optLang: "en",
optLangIsDefault: true,
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
@ -82,7 +83,10 @@ export default new Vuex.Store({
setSist2Info: (state, val) => state.sist2Info = val,
setSeed: (state, val) => state.seed = val,
setOptHideDuplicates: (state, val) => state.optHideDuplicates = val,
setOptLang: (state, val) => state.optLang = val,
setOptLang: (state, val) => {
state.optLang = val;
state.optLangIsDefault = false;
},
setSortMode: (state, val) => state.sortMode = val,
setIndices: (state, val) => {
state.indices = val;
@ -148,6 +152,7 @@ export default new Vuex.Store({
setOptHideLegacy: (state, val) => state.optHideLegacy = val,
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
setUiMimeMap: (state, val) => state.uiMimeMap = val,
@ -159,6 +164,13 @@ export default new Vuex.Store({
},
},
actions: {
setSist2Info: (store, val) => {
store.commit("setSist2Info", val);
if (store.state.optLangIsDefault) {
store.commit("setOptLang", val.lang);
}
},
loadFromArgs({commit}, route: Route) {
if (route.query.q) {

View File

@ -15,15 +15,8 @@
<h4>{{ $t("displayOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<label>{{ $t("opt.lang") }}</label>
<label><LanguageIcon/><span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<label>{{ $t("opt.theme") }}</label>
@ -34,6 +27,16 @@
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<div style="height: 10px"></div>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
</b-card>
<br/>
@ -117,15 +120,15 @@
</template>
<script>
import Vue from "vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import DebugInfo from "@/components/DebugInfo.vue";
import Preloader from "@/components/Preloader.vue";
import sist2 from "@/Sist2Api";
import GearIcon from "@/components/GearIcon.vue";
import GearIcon from "@/components/icons/GearIcon.vue";
import LanguageIcon from "@/components/icons/LanguageIcon";
export default {
components: {GearIcon, DebugInfo, Preloader},
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: true,
@ -228,7 +231,7 @@ export default {
},
mounted() {
sist2.getSist2Info().then(data => {
this.$store.commit("setSist2Info", data)
this.setSist2Info(data);
this.loading = false;
});
@ -239,6 +242,9 @@ export default {
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
@ -256,7 +262,6 @@ export default {
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptContainerWidth",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",

View File

@ -60,7 +60,7 @@
<script lang="ts">
import Preloader from "@/components/Preloader.vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import sist2 from "../Sist2Api";
import Sist2Api, {EsHit, EsResult} from "../Sist2Api";
import SearchBar from "@/components/SearchBar.vue";
@ -151,8 +151,10 @@ export default Vue.extend({
});
},
methods: {
...mapMutations({
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",

View File

@ -146,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->name == NULL) {
args->name = g_path_get_basename(args->output);
} else {
char* tmp = malloc(strlen(args->name) + 1);
char *tmp = malloc(strlen(args->name) + 1);
strcpy(tmp, args->name);
args->name = tmp;
}
@ -168,17 +168,50 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
return 1;
}
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
if (args->ocr_images && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
return 1;
}
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
return 1;
}
if (args->tesseract_lang != NULL) {
if (!args->ocr_ebooks && !args->ocr_images) {
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
return 1;
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
TessBaseAPI *api = TessBaseAPICreate();
const char *trained_data_path = NULL;
char *lang = malloc(strlen(args->tesseract_lang) + 1);
strcpy(lang, args->tesseract_lang);
lang = strtok(lang, "+");
while (lang != NULL) {
char filename[128];
sprintf(filename, "%s.traineddata", lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
}
if (trained_data_path != NULL && path != trained_data_path) {
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
"files must be in the same folder")
}
trained_data_path = path;
lang = strtok(NULL, "+");
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
@ -186,7 +219,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
args->tesseract_path = trained_data_path;
}
if (args->exclude_regex != NULL) {
@ -220,7 +253,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
}
if (args->list_path != NULL) {
if(strcmp(args->list_path, "-") == 0) {
if (strcmp(args->list_path, "-") == 0) {
args->list_file = stdin;
LOG_DEBUG("cli.c", "Using stdin as list file")
} else {
@ -377,15 +410,15 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
args->es_index = DEFAULT_ES_INDEX;
}
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (args->tagline == NULL) {
args->tagline = DEFAULT_TAGLINE;
}
if (strlen(args->lang) != 2) {
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (strlen(args->lang) != 2 && strlen(args->lang) != 5) {
fprintf(stderr, "Invalid --lang value, see usage\n");
return 1;
}

View File

@ -21,6 +21,8 @@ typedef struct scan_args {
char *archive_passphrase;
char *tesseract_lang;
const char *tesseract_path;
int ocr_images;
int ocr_ebooks;
char *exclude_regex;
int fast;
const char* treemap_threshold_str;

View File

@ -98,7 +98,7 @@ typedef struct {
int tag_auth_enabled;
char *tagline;
struct index_t indices[256];
char lang[3];
char lang[10];
int dev;
} WebCtx_t;

View File

@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "parent";
case MetaExifMake:
return "exif_make";
case MetaExifDescription:
return "exif_description";
case MetaExifSoftware:
return "exif_software";
case MetaExifExposureTime:
@ -150,6 +152,7 @@ char *build_json_string(document_t *doc) {
case MetaFontName:
case MetaParent:
case MetaExifMake:
case MetaExifDescription:
case MetaExifSoftware:
case MetaExifExposureTime:
case MetaExifFNumber:

View File

@ -220,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
if (args->ocr_images) {
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
}
init_media();
// OOXML
@ -501,7 +506,7 @@ void sist2_web(web_args_t *args) {
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
WebCtx.tagline = args->tagline;
WebCtx.dev = args->dev;
strcpy(WebCtx.lang, "en");
strcpy(WebCtx.lang, args->lang);
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
@ -576,8 +581,11 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
"Passphrase for encrypted archive files"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
"Tesseract language (use 'tesseract --list-langs' to see "
"which are installed on your machine)"),
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
@ -614,6 +622,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
OPT_STRING(0, "lang", &web_args->lang, "Default UI language. Can be changed by the user"),
OPT_GROUP("Exec-script options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),

View File

@ -280,7 +280,6 @@ void index_info(struct mg_connection *nc) {
cJSON_AddBoolToObject(json, "esVersionLegacy", USE_LEGACY_ES_SETTINGS(WebCtx.es_version));
cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
cJSON_AddStringToObject(json, "libscanHash", LibScanCommitHash);
cJSON_AddStringToObject(json, "lang", WebCtx.lang);
cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
#ifdef SIST_DEBUG

File diff suppressed because one or more lines are too long

View File

@ -5,9 +5,7 @@
#include "../media/media.h"
#include "../arc/arc.h"
#define MIN_OCR_SIZE 350
#define MIN_OCR_LEN 10
#include "../ocr/ocr.h"
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
@ -225,7 +223,9 @@ static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
return 0;
}
#define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32)
/*
 * Callback handed to ocr_extract_text() by fill_image(): appends the text
 * recognized in an embedded image to the thread-local thread_buffer.
 */
static void fill_image_ocr_cb(const char* text, size_t len) {
// One byte less than len is appended -- presumably to drop the trailing
// newline that Tesseract emits; TODO confirm.
text_buffer_append_string(&thread_buffer, text, len - 1);
}
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
@ -233,26 +233,9 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
int l2factor = 0;
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
TessBaseAPI *api = TessBaseAPICreate();
TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang);
TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
TessBaseAPISetSourceResolution(api, pix->xres);
char *text = TessBaseAPIGetUTF8Text(api);
size_t len = strlen(text);
if (len >= MIN_OCR_LEN) {
text_buffer_append_string(&thread_buffer, text, len - 1);
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
}
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);
fz_drop_pixmap(fzctx, pix);
}
}

View File

@ -1,12 +1,18 @@
#include "media.h"
#include "../ocr/ocr.h"
#include <ctype.h>
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
#define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)
#define STREAM_IS_IMAGE (stream->nb_frames <= 1)
#define STORE_AS_IS ((void*)-1)
// Pointer to document being processed
__thread document_t *thread_doc;
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
int has_extension = doc->ext > doc->base;
@ -311,7 +317,7 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(key, "imagedescription") == 0) {
APPEND_TAG_META(MetaContent)
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake)
} else if (strcmp(key, "model") == 0) {
@ -343,6 +349,55 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
}
}
/*
 * Callback handed to ocr_extract_text() by ocr_image(): records the
 * recognized text as MetaContent (through the APPEND_STR_META macro) on the
 * document referenced by the thread-local thread_doc, which ocr_image() sets
 * right before starting the OCR. Only text is forwarded to the macro; len is
 * not passed on.
 */
static void ocr_image_cb(const char *text, size_t len) {
APPEND_STR_META(thread_doc, MetaContent, text);
}
#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32
#define OCR_BYTES_PER_PIXEL 4
#define OCR_PIXELS_PER_INCH 70
/**
 * Run OCR on a decoded image frame and attach the recognized text to doc.
 *
 * The frame is converted to OCR_PIXEL_FORMAT with libswscale (same
 * dimensions, no resizing) and the converted pixels are passed to
 * ocr_extract_text(), which reports the text through ocr_image_cb().
 *
 * If any FFmpeg allocation or conversion step fails, OCR is skipped for this
 * frame and the document is left untouched; everything acquired up to that
 * point is released.
 *
 * @param ctx      media scan context; provides tesseract_path / tesseract_lang
 * @param doc      document that receives the OCR'd text
 * @param decoder  codec context; its pix_fmt is used as the source pixel format
 * @param frame    decoded frame to run OCR on
 */
void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {

    AVFrame *rgb_frame = NULL;
    uint8_t *dst_buf = NULL;
    int dst_buf_len = 0;

    // Convert to RGB32
    struct SwsContext *sws_ctx = sws_getContext(
            frame->width, frame->height, decoder->pix_fmt,
            frame->width, frame->height, OCR_PIXEL_FORMAT,
            SWS_LANCZOS, 0, 0, 0
    );
    if (sws_ctx == NULL) {
        // Unsupported source format or invalid dimensions: nothing to OCR
        return;
    }

    rgb_frame = av_frame_alloc();
    if (rgb_frame == NULL) {
        goto cleanup;
    }

    dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
    if (dst_buf_len <= 0) {
        goto cleanup;
    }

    // NOTE(review): the buffer is allocated at twice the computed size, as in
    // the original code -- presumably as headroom for sws_scale; confirm
    // before shrinking it.
    dst_buf = (uint8_t *) av_malloc((size_t) dst_buf_len * 2);
    if (dst_buf == NULL) {
        goto cleanup;
    }

    if (av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf, OCR_PIXEL_FORMAT,
                             frame->width, frame->height, 1) < 0) {
        goto cleanup;
    }

    if (sws_scale(sws_ctx,
                  (const uint8_t *const *) frame->data, frame->linesize,
                  0, frame->height,
                  rgb_frame->data, rgb_frame->linesize) <= 0) {
        goto cleanup;
    }

    // ocr_image_cb() has no user-data argument, so the target document is
    // handed over through the thread-local pointer.
    thread_doc = doc;
    ocr_extract_text(
            ctx->tesseract_path,
            ctx->tesseract_lang,
            rgb_frame->data[0],
            frame->width,
            frame->height,
            OCR_BYTES_PER_PIXEL,
            rgb_frame->linesize[0],
            OCR_PIXELS_PER_INCH,
            ocr_image_cb
    );

cleanup:
    // All three release functions accept NULL, so a single exit path is safe.
    sws_freeContext(sws_ctx);
    av_free(dst_buf);
    av_frame_free(&rgb_frame);
}
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
@ -419,11 +474,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
avcodec_open2(decoder, video_codec, NULL);
//Seek
if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
stream->duration * 0.10, 0);
(long) ((double) stream->duration * 0.10), 0);
if (seek_ret == 0) {
break;
}
@ -438,6 +493,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
return;
}
if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
}
// NOTE: OCR'd content takes precedence over exif image description
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
@ -534,7 +594,7 @@ long memfile_seek(void *ptr, long offset, int whence) {
memfile_t *mem = ptr;
if (whence == 0x10000) {
return mem->size;
return (long) mem->size;
}
int ret = fseek(mem->file, offset, whence);

View File

@ -19,6 +19,9 @@ typedef struct {
float tn_qscale;
long max_media_buffer;
int read_subtitles;
const char *tesseract_lang;
const char *tesseract_path;
} scan_media_ctx_t;
__always_inline

47
third-party/libscan/libscan/ocr/ocr.h vendored Normal file
View File

@ -0,0 +1,47 @@
#ifndef OCR_H
#define OCR_H
#include "../scan.h"
#include <tesseract/capi.h>
#define MIN_OCR_WIDTH 350
#define MIN_OCR_HEIGHT 100
#define MIN_OCR_LEN 10
#define OCR_IS_VALID_BPP(d) \
((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || (d) == 16 || (d) == 24 || \
(d) == 32)
typedef void (*ocr_extract_callback_t)(const char *, size_t);
/**
 * Run Tesseract on a raw image buffer and pass the recognized text to cb.
 *
 * Nothing happens (cb is not called) when:
 *  - the image is narrower than MIN_OCR_WIDTH or shorter than MIN_OCR_HEIGHT,
 *  - img_xres is not positive,
 *  - img_bpp is rejected by OCR_IS_VALID_BPP,
 *  - cb is NULL,
 *  - Tesseract cannot be initialized with the given path/language,
 *  - fewer than MIN_OCR_LEN bytes of text are recognized.
 *
 * @param tesseract_path  data path given to TessBaseAPIInit3
 * @param tesseract_lang  language string given to TessBaseAPIInit3
 * @param img_buf         pixel data given to TessBaseAPISetImage
 * @param img_w           image width in pixels
 * @param img_h           image height in pixels
 * @param img_bpp         forwarded as the bytes-per-pixel argument of
 *                        TessBaseAPISetImage
 * @param img_stride      forwarded as the bytes-per-line argument of
 *                        TessBaseAPISetImage
 * @param img_xres        source resolution given to
 *                        TessBaseAPISetSourceResolution
 * @param cb              receives the UTF-8 text and its strlen(); the text is
 *                        freed right after cb returns, so cb must copy
 *                        whatever it wants to keep
 */
__always_inline static void
ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
                 const unsigned char *img_buf, const int img_w, const int img_h,
                 const int img_bpp, const int img_stride, const int img_xres,
                 const ocr_extract_callback_t cb) {

    if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
        !OCR_IS_VALID_BPP(img_bpp) || cb == NULL) {
        return;
    }

    TessBaseAPI *api = TessBaseAPICreate();

    // A non-zero return means the engine could not be set up (e.g. missing
    // traineddata); using the handle for recognition afterwards is invalid.
    if (TessBaseAPIInit3(api, tesseract_path, tesseract_lang) != 0) {
        TessBaseAPIDelete(api);
        return;
    }

    TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
    TessBaseAPISetSourceResolution(api, img_xres);

    char *text = TessBaseAPIGetUTF8Text(api);
    if (text != NULL) {
        size_t len = strlen(text);
        // Very short results are discarded
        if (len >= MIN_OCR_LEN) {
            cb(text, len);
        }
        TessDeleteText(text);
    }

    TessBaseAPIEnd(api);
    TessBaseAPIDelete(api);
}
#endif

View File

@ -61,6 +61,7 @@ enum metakey {
MetaFontName,
MetaParent,
MetaExifMake,
MetaExifDescription,
MetaExifSoftware,
MetaExifExposureTime,
MetaExifFNumber,