mirror of
https://github.com/simon987/sist2.git
synced 2025-04-19 02:06:46 +00:00
Add NER support
This commit is contained in:
parent
b5cdd9a5df
commit
dc39c0ec4b
28
README.md
28
README.md
@ -24,10 +24,12 @@ sist2 (Simple incremental search tool)
|
|||||||
* Recursive scan inside archive files \*\*
|
* Recursive scan inside archive files \*\*
|
||||||
* OCR support with tesseract \*\*\*
|
* OCR support with tesseract \*\*\*
|
||||||
* Stats page & disk utilisation visualization
|
* Stats page & disk utilisation visualization
|
||||||
|
* Named-entity recognition (client-side) \*\*\*\*
|
||||||
|
|
||||||
\* See [format support](#format-support)
|
\* See [format support](#format-support)
|
||||||
\*\* See [Archive files](#archive-files)
|
\*\* See [Archive files](#archive-files)
|
||||||
\*\*\* See [OCR](#ocr)
|
\*\*\* See [OCR](#ocr)
|
||||||
|
\*\*\*\* See [Named-Entity Recognition](#ner)
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
@ -68,10 +70,9 @@ Navigate to http://localhost:8080/ to configure sist2-admin.
|
|||||||
```
|
```
|
||||||
|
|
||||||
2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
|
2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
|
||||||
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
|
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
|
||||||
3. See [usage guide](docs/USAGE.md) for command line usage.
|
3. See [usage guide](docs/USAGE.md) for command line usage.
|
||||||
|
|
||||||
|
|
||||||
Example usage:
|
Example usage:
|
||||||
|
|
||||||
1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
|
1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
|
||||||
@ -135,6 +136,29 @@ sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
|
|||||||
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
|
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### NER
|
||||||
|
|
||||||
|
sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported repository URL to
|
||||||
|
**Configuration** > **Machine learning options** > **Model repositories**
|
||||||
|
to enable it.
|
||||||
|
|
||||||
|
The text processing is done entirely in your browser; no data is sent to any third-party service.
|
||||||
|
See [simon987/sist2-ner-models](https://github.com/simon987/sist2-ner-models) for more details.
|
||||||
|
|
||||||
|
#### List of available repositories:
|
||||||
|
|
||||||
|
| URL | Maintainer | Purpose |
|
||||||
|
|---------------------------------------------------------------------------------------------------------|-----------------------------------------|---------|
|
||||||
|
| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General |
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Screenshot</summary>
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
## Build from source
|
## Build from source
|
||||||
|
|
||||||
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries
|
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries
|
||||||
|
BIN
docs/ner.png
Normal file
BIN
docs/ner.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 448 KiB |
720
sist2-vue/package-lock.json
generated
720
sist2-vue/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,6 +9,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@auth0/auth0-spa-js": "^2.0.2",
|
"@auth0/auth0-spa-js": "^2.0.2",
|
||||||
"@egjs/vue-infinitegrid": "3.3.0",
|
"@egjs/vue-infinitegrid": "3.3.0",
|
||||||
|
"@tensorflow/tfjs": "^4.4.0",
|
||||||
"axios": "^0.25.0",
|
"axios": "^0.25.0",
|
||||||
"bootstrap-vue": "^2.21.2",
|
"bootstrap-vue": "^2.21.2",
|
||||||
"core-js": "^3.6.5",
|
"core-js": "^3.6.5",
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
import NavBar from "@/components/NavBar";
|
import NavBar from "@/components/NavBar";
|
||||||
import {mapActions, mapGetters, mapMutations} from "vuex";
|
import {mapActions, mapGetters, mapMutations} from "vuex";
|
||||||
import Sist2Api from "@/Sist2Api";
|
import Sist2Api from "@/Sist2Api";
|
||||||
|
import ModelsRepo from "@/ml/modelsRepo";
|
||||||
import {setupAuth0} from "@/main";
|
import {setupAuth0} from "@/main";
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
@ -36,6 +37,17 @@ export default {
|
|||||||
mounted() {
|
mounted() {
|
||||||
this.$store.dispatch("loadConfiguration").then(() => {
|
this.$store.dispatch("loadConfiguration").then(() => {
|
||||||
this.$root.$i18n.locale = this.$store.state.optLang;
|
this.$root.$i18n.locale = this.$store.state.optLang;
|
||||||
|
ModelsRepo.init(this.$store.getters.mlRepositoryList).catch(err => {
|
||||||
|
this.$bvToast.toast(
|
||||||
|
this.$t("ml.repoFetchError"),
|
||||||
|
{
|
||||||
|
title: this.$t("ml.repoFetchErrorTitle"),
|
||||||
|
noAutoHide: true,
|
||||||
|
toaster: "b-toaster-bottom-right",
|
||||||
|
headerClass: "toast-header-warning",
|
||||||
|
bodyClass: "toast-body-warning",
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
this.$store.subscribe((mutation) => {
|
this.$store.subscribe((mutation) => {
|
||||||
|
21
sist2-vue/src/components/AnalyzedContentSpan.vue
Normal file
21
sist2-vue/src/components/AnalyzedContentSpan.vue
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
<template>
|
||||||
|
<span :style="getStyle()">{{span.text}}</span>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
|
||||||
|
|
||||||
|
import ModelsRepo from "@/ml/modelsRepo";
|
||||||
|
|
||||||
|
/**
 * Renders one analyzed text span, styled according to its label.
 *
 * Props:
 *  - span: object with at least `text` (rendered by the template) and `label`.
 *  - text: declared for callers, not read by this component.
 */
export default {
    name: "AnalyzedContentSpan",
    props: ["span", "text"],
    methods: {
        /**
         * Looks up the inline style registered for this span's label in the
         * metadata of the model currently held in the store.
         */
        getStyle() {
            const activeModelName = this.$store.getters.mlModel.name;
            const {labelStyles} = ModelsRepo.data[activeModelName];

            return labelStyles[this.span.label];
        }
    }
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped></style>
|
75
sist2-vue/src/components/AnalyzedContentSpanContainer.vue
Normal file
75
sist2-vue/src/components/AnalyzedContentSpanContainer.vue
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
<template>
|
||||||
|
<div>
|
||||||
|
<b-card class="mb-2">
|
||||||
|
<AnalyzedContentSpan v-for="span of legend" :key="span.id" :span="span"
|
||||||
|
class="mr-2"></AnalyzedContentSpan>
|
||||||
|
</b-card>
|
||||||
|
<div class="content-div">
|
||||||
|
<AnalyzedContentSpan v-for="span of mergedSpans" :key="span.id" :span="span"></AnalyzedContentSpan>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
|
||||||
|
|
||||||
|
import AnalyzedContentSpan from "@/components/AnalyzedContentSpan.vue";
|
||||||
|
import ModelsRepo from "@/ml/modelsRepo";
|
||||||
|
|
||||||
|
/**
 * Displays a legend card followed by the analyzed text, where consecutive
 * words sharing the same label are merged into a single styled span.
 *
 * Props:
 *  - spans: per-word predictions, each with `wordIndex` (offset into `text`)
 *           and `label`.
 *  - text:  the raw text the `wordIndex` offsets refer to.
 */
export default {
    name: "AnalyzedContentSpanContainer",
    components: {AnalyzedContentSpan},
    props: ["spans", "text"],
    computed: {
        /** One pseudo-span per legend entry of the active model (label -> display name). */
        legend() {
            return Object.entries(ModelsRepo.data[this.$store.state.mlModel.name].legend)
                .map(([label, name]) => ({
                    text: name,
                    id: label,
                    label: label
                }));
        },
        /**
         * Groups runs of equally-labelled words into one span each.
         *
         * Each run covers the text from its first word up to the first word of
         * the next run (or the end of the text for the last run). When a run
         * would end with a space, that space is emitted as a separate "O" span
         * so that the highlighted area does not include the trailing blank.
         *
         * Runs are located in a single forward pass; the array is not
         * re-sliced or re-searched for every run.
         */
        mergedSpans() {
            const spans = this.spans;
            const text = this.text;
            const merged = [];

            let runStart = 0;
            while (runStart < spans.length) {
                const label = spans[runStart].label;

                // Advance to the first span carrying a different label.
                let runEnd = runStart + 1;
                while (runEnd < spans.length && spans[runEnd].label === label) {
                    runEnd++;
                }

                const start = spans[runStart].wordIndex;
                // `undefined` makes slice() run to the end of the text.
                let end = runEnd < spans.length ? spans[runEnd].wordIndex : undefined;

                const endsWithSpace = end !== undefined && text[end - 1] === " ";
                if (endsWithSpace) {
                    end -= 1;
                }

                merged.push({
                    text: text.slice(start, end),
                    label: label,
                    id: start
                });

                if (endsWithSpace) {
                    merged.push({
                        text: " ",
                        label: "O",
                        id: end
                    });
                }

                runStart = runEnd;
            }

            return merged;
        },
    },
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped></style>
|
@ -1,6 +1,36 @@
|
|||||||
<template>
|
<template>
|
||||||
<Preloader v-if="loading"></Preloader>
|
<Preloader v-if="loading"></Preloader>
|
||||||
<div v-else-if="content" class="content-div" v-html="content"></div>
|
<div v-else-if="content">
|
||||||
|
<b-form inline class="my-2" v-if="ModelsRepo.getOptions().length > 0">
|
||||||
|
<b-checkbox class="ml-auto mr-2" :checked="optAutoAnalyze"
|
||||||
|
@input="setOptAutoAnalyze($event); $store.dispatch('updateConfiguration')">
|
||||||
|
{{ $t("ml.auto") }}
|
||||||
|
</b-checkbox>
|
||||||
|
<b-button :disabled="mlPredictionsLoading || mlLoading" @click="mlAnalyze" variant="primary"
|
||||||
|
>{{ $t("ml.analyzeText") }}
|
||||||
|
</b-button>
|
||||||
|
<b-select :disabled="mlPredictionsLoading || mlLoading" class="ml-2" v-model="mlModel">
|
||||||
|
<b-select-option :value="opt.value" v-for="opt of ModelsRepo.getOptions()">{{ opt.text }}
|
||||||
|
</b-select-option>
|
||||||
|
</b-select>
|
||||||
|
</b-form>
|
||||||
|
|
||||||
|
<b-progress v-if="mlLoading" variant="warning" show-progress :max="1" class="mb-3"
|
||||||
|
>
|
||||||
|
<b-progress-bar :value="modelLoadingProgress">
|
||||||
|
<strong>{{ ((modelLoadingProgress * modelSize) / (1024*1024)).toFixed(1) }}MB / {{
|
||||||
|
(modelSize / (1024 * 1024)).toFixed(1)
|
||||||
|
}}MB</strong>
|
||||||
|
</b-progress-bar>
|
||||||
|
</b-progress>
|
||||||
|
|
||||||
|
<b-progress v-if="mlPredictionsLoading" variant="primary" :value="modelPredictionProgress"
|
||||||
|
:max="content.length" class="mb-3"></b-progress>
|
||||||
|
|
||||||
|
<AnalyzedContentSpansContainer v-if="analyzedContentSpans.length > 0"
|
||||||
|
:spans="analyzedContentSpans" :text="rawContent"></AnalyzedContentSpansContainer>
|
||||||
|
<div v-else class="content-div" v-html="content"></div>
|
||||||
|
</div>
|
||||||
</template>
|
</template>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
@ -8,22 +38,40 @@ import Sist2Api from "@/Sist2Api";
|
|||||||
import Preloader from "@/components/Preloader";
|
import Preloader from "@/components/Preloader";
|
||||||
import Sist2Query from "@/Sist2Query";
|
import Sist2Query from "@/Sist2Query";
|
||||||
import store from "@/store";
|
import store from "@/store";
|
||||||
|
import BertNerModel from "@/ml/BertNerModel";
|
||||||
|
import AnalyzedContentSpansContainer from "@/components/AnalyzedContentSpanContainer.vue";
|
||||||
|
import ModelsRepo from "@/ml/modelsRepo";
|
||||||
|
import {mapGetters, mapMutations} from "vuex";
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
name: "LazyContentDiv",
|
name: "LazyContentDiv",
|
||||||
components: {Preloader},
|
components: {AnalyzedContentSpansContainer, Preloader},
|
||||||
props: ["docId"],
|
props: ["docId"],
|
||||||
data() {
|
data() {
|
||||||
return {
|
return {
|
||||||
|
ModelsRepo,
|
||||||
content: "",
|
content: "",
|
||||||
loading: true
|
rawContent: "",
|
||||||
|
loading: true,
|
||||||
|
modelLoadingProgress: 0,
|
||||||
|
modelPredictionProgress: 0,
|
||||||
|
mlPredictionsLoading: false,
|
||||||
|
mlLoading: false,
|
||||||
|
mlModel: null,
|
||||||
|
analyzedContentSpans: []
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
mounted() {
|
mounted() {
|
||||||
|
|
||||||
|
if (this.$store.getters.optMlDefaultModel) {
|
||||||
|
this.mlModel = this.$store.getters.optMlDefaultModel
|
||||||
|
} else {
|
||||||
|
this.mlModel = ModelsRepo.getDefaultModel();
|
||||||
|
}
|
||||||
|
|
||||||
const query = Sist2Query.searchQuery();
|
const query = Sist2Query.searchQuery();
|
||||||
|
|
||||||
if (this.$store.state.optHighlight) {
|
if (this.$store.state.optHighlight) {
|
||||||
|
|
||||||
const fields = this.$store.state.fuzzy
|
const fields = this.$store.state.fuzzy
|
||||||
? {"content.nGram": {}}
|
? {"content.nGram": {}}
|
||||||
: {content: {}};
|
: {content: {}};
|
||||||
@ -67,14 +115,28 @@ export default {
|
|||||||
this.loading = false;
|
this.loading = false;
|
||||||
if (resp.hits.hits.length === 1) {
|
if (resp.hits.hits.length === 1) {
|
||||||
this.content = this.getContent(resp.hits.hits[0]);
|
this.content = this.getContent(resp.hits.hits[0]);
|
||||||
} else {
|
}
|
||||||
console.log("FIXME: could not get content")
|
|
||||||
console.log(resp)
|
if (this.optAutoAnalyze) {
|
||||||
|
this.mlAnalyze();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
computed: {
|
||||||
|
...mapGetters(["optAutoAnalyze"]),
|
||||||
|
modelSize() {
|
||||||
|
const modelData = ModelsRepo.data[this.mlModel];
|
||||||
|
if (!modelData) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return modelData.size;
|
||||||
|
}
|
||||||
|
},
|
||||||
methods: {
|
methods: {
|
||||||
|
...mapMutations(["setOptAutoAnalyze"]),
|
||||||
getContent(doc) {
|
getContent(doc) {
|
||||||
|
this.rawContent = doc._source.content;
|
||||||
|
|
||||||
if (!doc.highlight) {
|
if (!doc.highlight) {
|
||||||
return doc._source.content;
|
return doc._source.content;
|
||||||
}
|
}
|
||||||
@ -85,10 +147,60 @@ export default {
|
|||||||
if (doc.highlight.content) {
|
if (doc.highlight.content) {
|
||||||
return doc.highlight.content[0];
|
return doc.highlight.content[0];
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
async getMlModel() {
|
||||||
|
if (this.$store.getters.mlModel.name !== this.mlModel) {
|
||||||
|
this.mlLoading = true;
|
||||||
|
this.modelLoadingProgress = 0;
|
||||||
|
const modelInfo = ModelsRepo.data[this.mlModel];
|
||||||
|
|
||||||
|
const model = new BertNerModel(
|
||||||
|
modelInfo.vocabUrl,
|
||||||
|
modelInfo.modelUrl,
|
||||||
|
modelInfo.id2label,
|
||||||
|
)
|
||||||
|
|
||||||
|
await model.init(progress => this.modelLoadingProgress = progress);
|
||||||
|
this.$store.commit("setMlModel", {model, name: this.mlModel});
|
||||||
|
|
||||||
|
this.mlLoading = false;
|
||||||
|
return model
|
||||||
|
}
|
||||||
|
|
||||||
|
return this.$store.getters.mlModel.model;
|
||||||
|
},
|
||||||
|
async mlAnalyze() {
|
||||||
|
if (!this.content) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const modelInfo = ModelsRepo.data[this.mlModel];
|
||||||
|
if (modelInfo === undefined) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.$store.commit("setOptMlDefaultModel", this.mlModel);
|
||||||
|
await this.$store.dispatch("updateConfiguration");
|
||||||
|
|
||||||
|
const model = await this.getMlModel();
|
||||||
|
|
||||||
|
this.analyzedContentSpans = [];
|
||||||
|
|
||||||
|
this.mlPredictionsLoading = true;
|
||||||
|
|
||||||
|
await model.predict(this.rawContent, results => {
|
||||||
|
results.forEach(result => result.label = modelInfo.humanLabels[result.label]);
|
||||||
|
this.analyzedContentSpans.push(...results);
|
||||||
|
this.modelPredictionProgress = results[results.length - 1].wordIndex;
|
||||||
|
});
|
||||||
|
this.mlPredictionsLoading = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style scoped>
|
<style>
|
||||||
|
.progress-bar {
|
||||||
|
transition: none;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
@ -49,6 +49,7 @@ export default {
|
|||||||
configReset: "Reset configuration",
|
configReset: "Reset configuration",
|
||||||
searchOptions: "Search options",
|
searchOptions: "Search options",
|
||||||
treemapOptions: "Treemap options",
|
treemapOptions: "Treemap options",
|
||||||
|
mlOptions: "Machine learning options",
|
||||||
displayOptions: "Display options",
|
displayOptions: "Display options",
|
||||||
opt: {
|
opt: {
|
||||||
lang: "Language",
|
lang: "Language",
|
||||||
@ -78,7 +79,10 @@ export default {
|
|||||||
simpleLightbox: "Disable animations in image viewer",
|
simpleLightbox: "Disable animations in image viewer",
|
||||||
showTagPickerFilter: "Display the tag filter bar",
|
showTagPickerFilter: "Display the tag filter bar",
|
||||||
featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
|
featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
|
||||||
featuredFieldsList: "Available variables"
|
featuredFieldsList: "Available variables",
|
||||||
|
autoAnalyze: "Automatically analyze text",
|
||||||
|
defaultModel: "Default model",
|
||||||
|
mlRepositories: "Model repositories (one per line)"
|
||||||
},
|
},
|
||||||
queryMode: {
|
queryMode: {
|
||||||
simple: "Simple",
|
simple: "Simple",
|
||||||
@ -171,6 +175,12 @@ export default {
|
|||||||
selectedIndex: "selected index",
|
selectedIndex: "selected index",
|
||||||
selectedIndices: "selected indices",
|
selectedIndices: "selected indices",
|
||||||
},
|
},
|
||||||
|
ml: {
|
||||||
|
analyzeText: "Analyze",
|
||||||
|
auto: "Auto",
|
||||||
|
repoFetchError: "Failed to get list of models. Check browser console for more details.",
|
||||||
|
repoFetchErrorTitle: "Could not fetch model repositories",
|
||||||
|
}
|
||||||
},
|
},
|
||||||
de: {
|
de: {
|
||||||
filePage: {
|
filePage: {
|
||||||
|
77
sist2-vue/src/ml/BertNerModel.js
Normal file
77
sist2-vue/src/ml/BertNerModel.js
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import BertTokenizer from "@/ml/BertTokenizer";
|
||||||
|
import * as tf from "@tensorflow/tfjs";
|
||||||
|
import axios from "axios";
|
||||||
|
|
||||||
|
/**
 * Token-classification (NER) model wrapper: downloads a vocabulary and a
 * TensorFlow.js graph model, then labels text chunk by chunk.
 */
export default class BertNerModel {
    vocabUrl;
    modelUrl;

    // Maps the numeric class id predicted by the model to a label string.
    id2label;
    _tokenizer;
    _model;
    // Number of token positions fed to the model per chunk.
    inputSize = 128;

    // Last word id emitted by alignLabels(); kept across chunks of one
    // predict() call so a word split over two chunks is reported once.
    _previousWordId = null;

    /**
     * @param vocabUrl URL of the vocabulary file
     * @param modelUrl URL of the graph model
     * @param id2label lookup from predicted class id to label
     */
    constructor(vocabUrl, modelUrl, id2label) {
        this.vocabUrl = vocabUrl;
        this.modelUrl = modelUrl;
        this.id2label = id2label;
    }

    /**
     * Loads tokenizer and model in parallel.
     * @param onProgress forwarded to the model loader as its progress callback
     */
    async init(onProgress) {
        await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
    }

    /** Downloads the vocabulary and builds the tokenizer from it. */
    async loadTokenizer() {
        const vocab = (await axios.get(this.vocabUrl)).data;
        this._tokenizer = new BertTokenizer(vocab);
    }

    /** Downloads the graph model, reporting progress through `onProgress`. */
    async loadModel(onProgress) {
        this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
    }

    /**
     * Converts per-token labels of one chunk into per-word results.
     *
     * Positions whose word id is -1 are skipped, and only the first token of
     * each word contributes a result.
     *
     * @param labels  label string for each of the `inputSize` positions
     * @param wordIds index into `words` for each position, or -1
     * @param words   tokenized words, each with `text` and `index`
     * @returns array of {word, wordIndex, label}
     */
    alignLabels(labels, wordIds, words) {
        const result = [];

        for (let i = 0; i < this.inputSize; i++) {
            const wordId = wordIds[i];

            if (wordId === -1 || wordId === this._previousWordId) {
                continue;
            }

            result.push({
                word: words[wordId].text, wordIndex: words[wordId].index, label: labels[i]
            });
            this._previousWordId = wordId;
        }

        return result;
    }

    /**
     * Labels `text`, invoking `callback` once per chunk with that chunk's
     * word-level results (see alignLabels).
     *
     * Every tensor created for a chunk is released before the next chunk is
     * processed: intermediates are freed by tf.tidy, and the argMax result is
     * disposed explicitly once its values have been read.
     */
    async predict(text, callback) {
        this._previousWordId = null;
        const encoded = this._tokenizer.encodeText(text, this.inputSize);

        for (const chunk of encoded.inputChunks) {
            // Only the tensor returned from tidy() survives; the inputs and
            // the raw model output are disposed when the scope ends.
            const labelIds = tf.tidy(() => {
                const rawResult = this._model.execute({
                    input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
                    token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
                    attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
                });
                return tf.argMax(rawResult, -1);
            });

            let labelIdsArray;
            try {
                labelIdsArray = await labelIds.array();
            } finally {
                labelIds.dispose();
            }

            const labels = labelIdsArray[0].map(id => this.id2label[id]);

            callback(this.alignLabels(labels, chunk.wordIds, encoded.words));
        }
    }
}
|
184
sist2-vue/src/ml/BertTokenizer.js
Normal file
184
sist2-vue/src/ml/BertTokenizer.js
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
import {zip, chunk} from "underscore";
|
||||||
|
|
||||||
|
// Vocabulary ids of the special tokens used below.
const UNK_INDEX = 100;
const CLS_INDEX = 101;
const SEP_INDEX = 102;
// Prefix carried by vocabulary entries that continue a word.
const CONTINUING_SUBWORD_PREFIX = "##";

/** True when `ch` matches the regex whitespace class. */
function isWhitespace(ch) {
    return /\s/.test(ch);
}

/** True for NUL and the Unicode replacement character, which are dropped. */
function isInvalid(ch) {
    return (ch.charCodeAt(0) === 0 || ch.charCodeAt(0) === 0xfffd);
}

const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';

/** To judge whether it's a punctuation. */
function isPunctuation(ch) {
    return punctuations.includes(ch);
}

/**
 * WordPiece-style tokenizer: splits text into lower-cased words and
 * punctuation, remembers where each one starts in the ORIGINAL text, and
 * encodes them as vocabulary ids in fixed-size chunks.
 */
export default class BertTokenizer {
    // Token list; the position of a token in this list is its id.
    // NOTE(review): assumed to be an array of strings — confirm against the vocab files.
    vocab;

    // Lazily built token -> id lookup and the vocab array it was built from.
    _vocabIndex = null;
    _indexedVocab = null;

    constructor(vocab) {
        this.vocab = vocab;
    }

    /**
     * Splits `text` into word and punctuation tokens.
     *
     * @returns array of {text, index}, where `text` is lower-cased and
     *          `index` is the UTF-16 offset of the token in the original,
     *          uncleaned `text`.
     */
    tokenize(text) {
        const charOriginalIndex = [];
        const cleanedText = this.cleanText(text, charOriginalIndex);

        // Offset (in the cleaned text) of the word currently being split.
        let charCount = 0;
        return cleanedText.split(' ').flatMap((word) => {
            const tokens = this.runSplitOnPunctuation(word.toLowerCase(), charCount, charOriginalIndex);
            // Advance by the length of the word as it appears in the cleaned
            // text (+1 for the separating space), not the lower-cased copy,
            // so a case mapping that changes length cannot shift later words.
            charCount += word.length + 1;
            return tokens;
        });
    }

    /**
     * Performs invalid character removal and whitespace cleanup on text.
     *
     * Question marks and invalid characters are dropped, leading/trailing
     * whitespace is removed and every whitespace run collapses to one space.
     *
     * `charOriginalIndex` is filled so that `charOriginalIndex[i]` is the
     * UTF-16 offset in the original `text` of the code unit at offset `i` of
     * the returned string. Offsets are tracked while scanning the original
     * text, so dropped characters do not shift the positions that follow.
     */
    cleanText(text, charOriginalIndex) {
        const stringBuilder = [];
        let originalCharIndex = 0;
        let newCharIndex = 0;

        for (const ch of text) {
            const sourceIndex = originalCharIndex;
            originalCharIndex += ch.length;

            // Skip the characters that cannot be used.
            if (ch === "?" || isInvalid(ch)) {
                continue;
            }

            if (isWhitespace(ch)) {
                const atStart = stringBuilder.length === 0;
                if (atStart || stringBuilder[stringBuilder.length - 1] === ' ') {
                    continue;
                }
                stringBuilder.push(' ');
                charOriginalIndex[newCharIndex] = sourceIndex;
                newCharIndex += 1;
            } else {
                stringBuilder.push(ch);
                // One entry per UTF-16 code unit: the consumers index this
                // table with string lengths, which count code units.
                for (let unit = 0; unit < ch.length; unit++) {
                    charOriginalIndex[newCharIndex + unit] = sourceIndex + unit;
                }
                newCharIndex += ch.length;
            }
        }

        // Equivalent of trimming the end: drop a dangling separator.
        if (stringBuilder[stringBuilder.length - 1] === ' ') {
            stringBuilder.pop();
        }

        return stringBuilder.join('');
    }

    /**
     * Splits punctuation on a piece of text.
     *
     * @param text  one whitespace-free word
     * @param count offset of `text` within the cleaned text
     * @param charOriginalIndex offset table produced by cleanText()
     * @returns array of {text, index}; every punctuation character is its own token
     */
    runSplitOnPunctuation(text, count, charOriginalIndex) {
        const tokens = [];
        let startNewWord = true;

        for (const ch of text) {
            if (isPunctuation(ch)) {
                tokens.push({text: ch, index: charOriginalIndex[count]});
                startNewWord = true;
            } else {
                if (startNewWord) {
                    tokens.push({text: '', index: charOriginalIndex[count]});
                    startNewWord = false;
                }
                tokens[tokens.length - 1].text += ch;
            }
            count += ch.length;
        }

        return tokens;
    }

    /**
     * Returns the token -> id lookup for the current vocabulary, building it
     * on first use (and again if `vocab` has been replaced). When a token
     * occurs more than once, the first position wins, as with indexOf().
     */
    _getVocabIndex() {
        if (this._indexedVocab !== this.vocab) {
            const index = new Map();
            this.vocab.forEach((token, id) => {
                if (!index.has(token)) {
                    index.set(token, id);
                }
            });
            this._vocabIndex = index;
            this._indexedVocab = this.vocab;
        }
        return this._vocabIndex;
    }

    /**
     * Encodes words as vocabulary ids using greedy longest-match-first
     * sub-word splitting. A word that cannot be fully covered by vocabulary
     * entries becomes a single UNK_INDEX.
     *
     * @param words array of {text, ...} as produced by tokenize()
     * @returns {tokens, wordIds}: parallel arrays where `wordIds[k]` is the
     *          index in `words` of the word that produced `tokens[k]`
     */
    encode(words) {
        const vocabIndex = this._getVocabIndex();
        const outputTokens = [];
        const wordIds = [];

        words.forEach((word, wordId) => {
            const chars = [...word.text];
            const subTokens = [];

            let isUnknown = false;
            let start = 0;

            while (start < chars.length) {
                let end = chars.length;
                let matchedId = null;

                // Try the longest remaining piece first, shrinking from the right.
                while (start < end) {
                    let substr = chars.slice(start, end).join('');
                    if (start > 0) {
                        substr = CONTINUING_SUBWORD_PREFIX + substr;
                    }

                    const id = vocabIndex.get(substr);
                    if (id !== undefined) {
                        matchedId = id;
                        break;
                    }

                    --end;
                }

                if (matchedId === null) {
                    isUnknown = true;
                    break;
                }
                subTokens.push(matchedId);
                start = end;
            }

            if (isUnknown) {
                outputTokens.push(UNK_INDEX);
                wordIds.push(wordId);
            } else {
                subTokens.forEach(tok => {
                    outputTokens.push(tok);
                    wordIds.push(wordId);
                });
            }
        });

        return {tokens: outputTokens, wordIds};
    }

    /**
     * Tokenizes and encodes `inputText`, then cuts the result into chunks of
     * exactly `inputSize` positions: CLS, up to `inputSize - 2` tokens, SEP,
     * then zero padding.
     *
     * @returns {inputChunks, words}. Each chunk has `inputIds`, `inputMask`
     *          (1 for real positions, 0 for padding), `segmentIds` (all 0)
     *          and `wordIds` (-1 for CLS, SEP and padding). `words` is the
     *          output of tokenize().
     */
    encodeText(inputText, inputSize) {

        const tokenized = this.tokenize(inputText);
        const encoded = this.encode(tokenized);

        const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
        const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);

        const chunks = [];

        zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, wordIds]) => {
            const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
            const segmentIds = Array(inputIds.length).fill(0);
            const inputMask = Array(inputIds.length).fill(1);
            wordIds = [-1, ...wordIds, -1];

            while (inputIds.length < inputSize) {
                inputIds.push(0);
                inputMask.push(0);
                segmentIds.push(0);
                wordIds.push(-1);
            }

            chunks.push({inputIds, inputMask, segmentIds, wordIds})
        });

        return {
            inputChunks: chunks,
            words: tokenized
        };
    }
}
|
43
sist2-vue/src/ml/modelsRepo.js
Normal file
43
sist2-vue/src/ml/modelsRepo.js
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
import axios from "axios";
|
||||||
|
|
||||||
|
/**
 * Registry of the models advertised by the configured model repositories.
 * A single shared instance is exported.
 */
class ModelsRepo {
    _repositories;
    // Model metadata keyed by model name.
    data = {};

    /**
     * Fetches every repository index and registers the models they list.
     *
     * Repositories are loaded independently: models from the repositories
     * that could be fetched are registered even when another one fails.
     * If any repository failed, the returned promise still rejects (with the
     * first failure) so the caller can notify the user.
     *
     * @param repositories array of repository index URLs
     */
    async init(repositories) {
        this._repositories = repositories;

        const outcomes = await Promise.allSettled(
            this._repositories.map(repository => this._loadRepository(repository))
        );

        outcomes
            .filter(outcome => outcome.status === "fulfilled")
            .forEach(outcome => {
                outcome.value.forEach(model => {
                    this.data[model.name] = model;
                });
            });

        const failure = outcomes.find(outcome => outcome.status === "rejected");
        if (failure) {
            throw failure.reason;
        }
    }

    /**
     * Downloads one repository index and resolves each model's `modelPath`
     * and `vocabPath` against the index URL, storing the results as
     * `modelUrl` and `vocabUrl` on the model entry.
     */
    async _loadRepository(repository) {
        const data = (await axios.get(repository)).data;
        data.forEach(model => {
            model["modelUrl"] = new URL(model["modelPath"], repository).href;
            model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
        });
        return data;
    }

    /** Select options ({text, value}) for all registered models; the text includes the size in MB. */
    getOptions() {
        return Object.values(this.data).map(model => ({
            text: `${model.name} (${Math.round(model.size / (1024*1024))}MB)`,
            value: model.name
        }));
    }

    /**
     * Name of the model to preselect: the first one flagged `default`, or
     * the first registered model when none is flagged. Returns null when no
     * model is registered.
     */
    getDefaultModel() {
        const models = Object.values(this.data);
        if (models.length === 0) {
            return null;
        }

        const preferred = models.find(model => model.default) ?? models[0];
        return preferred.name;
    }
}

export default new ModelsRepo();
|
@ -57,6 +57,9 @@ export default new Vuex.Store({
|
|||||||
optVidPreviewInterval: 700,
|
optVidPreviewInterval: 700,
|
||||||
optSimpleLightbox: true,
|
optSimpleLightbox: true,
|
||||||
optShowTagPickerFilter: true,
|
optShowTagPickerFilter: true,
|
||||||
|
optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
|
||||||
|
optAutoAnalyze: false,
|
||||||
|
optMlDefaultModel: null,
|
||||||
|
|
||||||
_onLoadSelectedIndices: [] as string[],
|
_onLoadSelectedIndices: [] as string[],
|
||||||
_onLoadSelectedMimeTypes: [] as string[],
|
_onLoadSelectedMimeTypes: [] as string[],
|
||||||
@ -86,7 +89,11 @@ export default new Vuex.Store({
|
|||||||
|
|
||||||
uiMimeMap: [] as any[],
|
uiMimeMap: [] as any[],
|
||||||
|
|
||||||
auth0Token: null
|
auth0Token: null,
|
||||||
|
mlModel: {
|
||||||
|
model: null,
|
||||||
|
name: null
|
||||||
|
},
|
||||||
},
|
},
|
||||||
mutations: {
|
mutations: {
|
||||||
setUiShowDetails: (state, val) => state.uiShowDetails = val,
|
setUiShowDetails: (state, val) => state.uiShowDetails = val,
|
||||||
@ -172,6 +179,9 @@ export default new Vuex.Store({
|
|||||||
setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
|
setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
|
||||||
setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
|
setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
|
||||||
setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
|
setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
|
||||||
|
setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
|
||||||
|
setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
|
||||||
|
setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},
|
||||||
|
|
||||||
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
|
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
|
||||||
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
|
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
|
||||||
@ -194,6 +204,7 @@ export default new Vuex.Store({
|
|||||||
// noop
|
// noop
|
||||||
},
|
},
|
||||||
setAuth0Token: (state, val) => state.auth0Token = val,
|
setAuth0Token: (state, val) => state.auth0Token = val,
|
||||||
|
setMlModel: (state, val) => state.mlModel = val,
|
||||||
},
|
},
|
||||||
actions: {
|
actions: {
|
||||||
setSist2Info: (store, val) => {
|
setSist2Info: (store, val) => {
|
||||||
@ -350,6 +361,7 @@ export default new Vuex.Store({
|
|||||||
},
|
},
|
||||||
modules: {},
|
modules: {},
|
||||||
getters: {
|
getters: {
|
||||||
|
mlModel: (state) => state.mlModel,
|
||||||
seed: (state) => state.seed,
|
seed: (state) => state.seed,
|
||||||
getPathText: (state) => state.pathText,
|
getPathText: (state) => state.pathText,
|
||||||
indices: state => state.indices,
|
indices: state => state.indices,
|
||||||
@ -416,5 +428,12 @@ export default new Vuex.Store({
|
|||||||
optSimpleLightbox: state => state.optSimpleLightbox,
|
optSimpleLightbox: state => state.optSimpleLightbox,
|
||||||
optShowTagPickerFilter: state => state.optShowTagPickerFilter,
|
optShowTagPickerFilter: state => state.optShowTagPickerFilter,
|
||||||
optFeaturedFields: state => state.optFeaturedFields,
|
optFeaturedFields: state => state.optFeaturedFields,
|
||||||
|
optMlRepositories: state => state.optMlRepositories,
|
||||||
|
// Repository URLs from the "one per line" option, as an array.
// Surrounding whitespace (including a trailing "\r") is removed and blank
// lines are ignored wherever they occur, so an empty first line no longer
// hides the entries after it and empty strings are never returned as URLs.
mlRepositoryList: state => {
    return state.optMlRepositories
        .split("\n")
        .map(url => url.trim())
        .filter(url => url !== "");
},
|
||||||
|
optMlDefaultModel: state => state.optMlDefaultModel,
|
||||||
|
optAutoAnalyze: state => state.optAutoAnalyze,
|
||||||
}
|
}
|
||||||
})
|
})
|
@ -25,7 +25,8 @@
|
|||||||
<b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>
|
<b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>
|
||||||
|
|
||||||
<label>{{ $t("opt.displayMode") }}</label>
|
<label>{{ $t("opt.displayMode") }}</label>
|
||||||
<b-form-select :options="displayModeOptions" :value="optDisplay" @input="setOptDisplay"></b-form-select>
|
<b-form-select :options="displayModeOptions" :value="optDisplay"
|
||||||
|
@input="setOptDisplay"></b-form-select>
|
||||||
|
|
||||||
<label>{{ $t("opt.columns") }}</label>
|
<label>{{ $t("opt.columns") }}</label>
|
||||||
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
|
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
|
||||||
@ -123,7 +124,10 @@
|
|||||||
}}
|
}}
|
||||||
</b-form-checkbox>
|
</b-form-checkbox>
|
||||||
|
|
||||||
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
|
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{
|
||||||
|
$t("opt.highlight")
|
||||||
|
}}
|
||||||
|
</b-form-checkbox>
|
||||||
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
|
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
|
||||||
$t("opt.tagOrOperator")
|
$t("opt.tagOrOperator")
|
||||||
}}
|
}}
|
||||||
@ -148,7 +152,8 @@
|
|||||||
@input="setOptResultSize"></b-form-input>
|
@input="setOptResultSize"></b-form-input>
|
||||||
|
|
||||||
<label>{{ $t("opt.queryMode") }}</label>
|
<label>{{ $t("opt.queryMode") }}</label>
|
||||||
<b-form-select :options="queryModeOptions" :value="optQueryMode" @input="setOptQueryMode"></b-form-select>
|
<b-form-select :options="queryModeOptions" :value="optQueryMode"
|
||||||
|
@input="setOptQueryMode"></b-form-select>
|
||||||
|
|
||||||
<label>{{ $t("opt.slideDuration") }}</label>
|
<label>{{ $t("opt.slideDuration") }}</label>
|
||||||
<b-form-input :value="optLightboxSlideDuration" type="number" min="1"
|
<b-form-input :value="optLightboxSlideDuration" type="number" min="1"
|
||||||
@ -159,6 +164,17 @@
|
|||||||
@input="setOptVidPreviewInterval"></b-form-input>
|
@input="setOptVidPreviewInterval"></b-form-input>
|
||||||
</b-card>
|
</b-card>
|
||||||
|
|
||||||
|
<h4 class="mt-3">{{ $t("mlOptions") }}</h4>
|
||||||
|
<b-card>
|
||||||
|
<label>{{ $t("opt.mlRepositories") }}</label>
|
||||||
|
<b-textarea rows="3" :value="optMlRepositories" @input="setOptMlRepositories"></b-textarea>
|
||||||
|
<br>
|
||||||
|
<b-form-checkbox :checked="optAutoAnalyze" @input="setOptAutoAnalyze">{{
|
||||||
|
$t("opt.autoAnalyze")
|
||||||
|
}}
|
||||||
|
</b-form-checkbox>
|
||||||
|
</b-card>
|
||||||
|
|
||||||
<h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
|
<h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
|
||||||
<b-card>
|
<b-card>
|
||||||
<label>{{ $t("opt.treemapType") }}</label>
|
<label>{{ $t("opt.treemapType") }}</label>
|
||||||
@ -311,6 +327,8 @@ export default {
|
|||||||
"optSimpleLightbox",
|
"optSimpleLightbox",
|
||||||
"optShowTagPickerFilter",
|
"optShowTagPickerFilter",
|
||||||
"optFeaturedFields",
|
"optFeaturedFields",
|
||||||
|
"optMlRepositories",
|
||||||
|
"optAutoAnalyze",
|
||||||
]),
|
]),
|
||||||
clientWidth() {
|
clientWidth() {
|
||||||
return window.innerWidth;
|
return window.innerWidth;
|
||||||
@ -355,6 +373,8 @@ export default {
|
|||||||
"setOptSimpleLightbox",
|
"setOptSimpleLightbox",
|
||||||
"setOptShowTagPickerFilter",
|
"setOptShowTagPickerFilter",
|
||||||
"setOptFeaturedFields",
|
"setOptFeaturedFields",
|
||||||
|
"setOptMlRepositories",
|
||||||
|
"setOptAutoAnalyze",
|
||||||
]),
|
]),
|
||||||
onResetClick() {
|
onResetClick() {
|
||||||
localStorage.removeItem("sist2_configuration");
|
localStorage.removeItem("sist2_configuration");
|
||||||
|
@ -51,11 +51,11 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include "git_hash.h"
|
#include "git_hash.h"
|
||||||
|
|
||||||
#define VERSION "3.0.3"
|
#define VERSION "3.0.4"
|
||||||
static const char *const Version = VERSION;
|
static const char *const Version = VERSION;
|
||||||
static const int VersionMajor = 3;
|
static const int VersionMajor = 3;
|
||||||
static const int VersionMinor = 0;
|
static const int VersionMinor = 0;
|
||||||
static const int VersionPatch = 3;
|
static const int VersionPatch = 4;
|
||||||
|
|
||||||
#ifndef SIST_PLATFORM
|
#ifndef SIST_PLATFORM
|
||||||
#define SIST_PLATFORM unknown
|
#define SIST_PLATFORM unknown
|
||||||
|
Loading…
x
Reference in New Issue
Block a user