mirror of
https://github.com/simon987/sist2.git
synced 2025-04-17 09:16:44 +00:00
Add NER support
This commit is contained in:
parent
b5cdd9a5df
commit
dc39c0ec4b
26
README.md
26
README.md
@ -24,10 +24,12 @@ sist2 (Simple incremental search tool)
|
||||
* Recursive scan inside archive files \*\*
|
||||
* OCR support with tesseract \*\*\*
|
||||
* Stats page & disk utilisation visualization
|
||||
* Named-entity recognition (client-side) \*\*\*\*
|
||||
|
||||
\* See [format support](#format-support)
|
||||
\*\* See [Archive files](#archive-files)
|
||||
\*\*\* See [OCR](#ocr)
|
||||
\*\*\*\* See [Named-Entity Recognition](#NER)
|
||||
|
||||
## Getting Started
|
||||
|
||||
@ -71,7 +73,6 @@ Navigate to http://localhost:8080/ to configure sist2-admin.
|
||||
Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
|
||||
3. See [usage guide](docs/USAGE.md) for command line usage.
|
||||
|
||||
|
||||
Example usage:
|
||||
|
||||
1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
|
||||
@ -135,6 +136,29 @@ sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
|
||||
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
|
||||
```
|
||||
|
||||
### NER
|
||||
|
||||
sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported repository URL to
|
||||
**Configuration** > **Machine learning options** > **Model repositories**
|
||||
to enable it.
|
||||
|
||||
The text processing is done in your browser, no data is sent to any third-party services.
|
||||
See [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) for more details.
|
||||
|
||||
#### List of available repositories:
|
||||
|
||||
| URL | Maintainer | Purpose |
|
||||
|---------------------------------------------------------------------------------------------------------|-----------------------------------------|---------|
|
||||
| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General |
|
||||
|
||||
|
||||
<details>
|
||||
<summary>Screenshot</summary>
|
||||
|
||||

|
||||
|
||||
</details>
|
||||
|
||||
## Build from source
|
||||
|
||||
You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries
|
||||
|
BIN
docs/ner.png
Normal file
BIN
docs/ner.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 448 KiB |
720
sist2-vue/package-lock.json
generated
720
sist2-vue/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,6 +9,7 @@
|
||||
"dependencies": {
|
||||
"@auth0/auth0-spa-js": "^2.0.2",
|
||||
"@egjs/vue-infinitegrid": "3.3.0",
|
||||
"@tensorflow/tfjs": "^4.4.0",
|
||||
"axios": "^0.25.0",
|
||||
"bootstrap-vue": "^2.21.2",
|
||||
"core-js": "^3.6.5",
|
||||
|
@ -19,6 +19,7 @@
|
||||
import NavBar from "@/components/NavBar";
|
||||
import {mapActions, mapGetters, mapMutations} from "vuex";
|
||||
import Sist2Api from "@/Sist2Api";
|
||||
import ModelsRepo from "@/ml/modelsRepo";
|
||||
import {setupAuth0} from "@/main";
|
||||
|
||||
export default {
|
||||
@ -36,6 +37,17 @@ export default {
|
||||
mounted() {
|
||||
this.$store.dispatch("loadConfiguration").then(() => {
|
||||
this.$root.$i18n.locale = this.$store.state.optLang;
|
||||
ModelsRepo.init(this.$store.getters.mlRepositoryList).catch(err => {
|
||||
this.$bvToast.toast(
|
||||
this.$t("ml.repoFetchError"),
|
||||
{
|
||||
title: this.$t("ml.repoFetchErrorTitle"),
|
||||
noAutoHide: true,
|
||||
toaster: "b-toaster-bottom-right",
|
||||
headerClass: "toast-header-warning",
|
||||
bodyClass: "toast-body-warning",
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
this.$store.subscribe((mutation) => {
|
||||
|
21
sist2-vue/src/components/AnalyzedContentSpan.vue
Normal file
21
sist2-vue/src/components/AnalyzedContentSpan.vue
Normal file
@ -0,0 +1,21 @@
|
||||
<template>
|
||||
<span :style="getStyle()">{{span.text}}</span>
|
||||
</template>
|
||||
|
||||
<script>
|
||||
|
||||
|
||||
import ModelsRepo from "@/ml/modelsRepo";
|
||||
|
||||
export default {
|
||||
name: "AnalyzedContentSpan",
|
||||
props: ["span", "text"],
|
||||
methods: {
|
||||
getStyle() {
|
||||
return ModelsRepo.data[this.$store.getters.mlModel.name].labelStyles[this.span.label];
|
||||
}
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<style scoped></style>
|
75
sist2-vue/src/components/AnalyzedContentSpanContainer.vue
Normal file
75
sist2-vue/src/components/AnalyzedContentSpanContainer.vue
Normal file
@ -0,0 +1,75 @@
|
||||
<template>
|
||||
<div>
|
||||
<b-card class="mb-2">
|
||||
<AnalyzedContentSpan v-for="span of legend" :key="span.id" :span="span"
|
||||
class="mr-2"></AnalyzedContentSpan>
|
||||
</b-card>
|
||||
<div class="content-div">
|
||||
<AnalyzedContentSpan v-for="span of mergedSpans" :key="span.id" :span="span"></AnalyzedContentSpan>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script>
|
||||
|
||||
|
||||
import AnalyzedContentSpan from "@/components/AnalyzedContentSpan.vue";
|
||||
import ModelsRepo from "@/ml/modelsRepo";
|
||||
|
||||
export default {
|
||||
name: "AnalyzedContentSpanContainer",
|
||||
components: {AnalyzedContentSpan},
|
||||
props: ["spans", "text"],
|
||||
computed: {
|
||||
legend() {
|
||||
return Object.entries(ModelsRepo.data[this.$store.state.mlModel.name].legend)
|
||||
.map(([label, name]) => ({
|
||||
text: name,
|
||||
id: label,
|
||||
label: label
|
||||
}));
|
||||
},
|
||||
mergedSpans() {
|
||||
const spans = this.spans;
|
||||
|
||||
const merged = [];
|
||||
|
||||
let lastLabel = null;
|
||||
let fixSpace = false;
|
||||
for (let i = 0; i < spans.length; i++) {
|
||||
|
||||
if (spans[i].label !== lastLabel) {
|
||||
let start = spans[i].wordIndex;
|
||||
const nextSpan = spans.slice(i + 1).find(s => s.label !== spans[i].label)
|
||||
let end = nextSpan ? nextSpan.wordIndex : undefined;
|
||||
|
||||
if (end !== undefined && this.text[end - 1] === " ") {
|
||||
end -= 1;
|
||||
fixSpace = true;
|
||||
}
|
||||
|
||||
merged.push({
|
||||
text: this.text.slice(start, end),
|
||||
label: spans[i].label,
|
||||
id: spans[i].wordIndex
|
||||
});
|
||||
|
||||
if (fixSpace) {
|
||||
merged.push({
|
||||
text: " ",
|
||||
label: "O",
|
||||
id: end
|
||||
});
|
||||
fixSpace = false;
|
||||
}
|
||||
lastLabel = spans[i].label;
|
||||
}
|
||||
}
|
||||
|
||||
return merged;
|
||||
},
|
||||
},
|
||||
}
|
||||
</script>
|
||||
|
||||
<style scoped></style>
|
@ -1,6 +1,36 @@
|
||||
<template>
|
||||
<Preloader v-if="loading"></Preloader>
|
||||
<div v-else-if="content" class="content-div" v-html="content"></div>
|
||||
<div v-else-if="content">
|
||||
<b-form inline class="my-2" v-if="ModelsRepo.getOptions().length > 0">
|
||||
<b-checkbox class="ml-auto mr-2" :checked="optAutoAnalyze"
|
||||
@input="setOptAutoAnalyze($event); $store.dispatch('updateConfiguration')">
|
||||
{{ $t("ml.auto") }}
|
||||
</b-checkbox>
|
||||
<b-button :disabled="mlPredictionsLoading || mlLoading" @click="mlAnalyze" variant="primary"
|
||||
>{{ $t("ml.analyzeText") }}
|
||||
</b-button>
|
||||
<b-select :disabled="mlPredictionsLoading || mlLoading" class="ml-2" v-model="mlModel">
|
||||
<b-select-option :value="opt.value" v-for="opt of ModelsRepo.getOptions()">{{ opt.text }}
|
||||
</b-select-option>
|
||||
</b-select>
|
||||
</b-form>
|
||||
|
||||
<b-progress v-if="mlLoading" variant="warning" show-progress :max="1" class="mb-3"
|
||||
>
|
||||
<b-progress-bar :value="modelLoadingProgress">
|
||||
<strong>{{ ((modelLoadingProgress * modelSize) / (1024*1024)).toFixed(1) }}MB / {{
|
||||
(modelSize / (1024 * 1024)).toFixed(1)
|
||||
}}MB</strong>
|
||||
</b-progress-bar>
|
||||
</b-progress>
|
||||
|
||||
<b-progress v-if="mlPredictionsLoading" variant="primary" :value="modelPredictionProgress"
|
||||
:max="content.length" class="mb-3"></b-progress>
|
||||
|
||||
<AnalyzedContentSpansContainer v-if="analyzedContentSpans.length > 0"
|
||||
:spans="analyzedContentSpans" :text="rawContent"></AnalyzedContentSpansContainer>
|
||||
<div v-else class="content-div" v-html="content"></div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script>
|
||||
@ -8,22 +38,40 @@ import Sist2Api from "@/Sist2Api";
|
||||
import Preloader from "@/components/Preloader";
|
||||
import Sist2Query from "@/Sist2Query";
|
||||
import store from "@/store";
|
||||
import BertNerModel from "@/ml/BertNerModel";
|
||||
import AnalyzedContentSpansContainer from "@/components/AnalyzedContentSpanContainer.vue";
|
||||
import ModelsRepo from "@/ml/modelsRepo";
|
||||
import {mapGetters, mapMutations} from "vuex";
|
||||
|
||||
export default {
|
||||
name: "LazyContentDiv",
|
||||
components: {Preloader},
|
||||
components: {AnalyzedContentSpansContainer, Preloader},
|
||||
props: ["docId"],
|
||||
data() {
|
||||
return {
|
||||
ModelsRepo,
|
||||
content: "",
|
||||
loading: true
|
||||
rawContent: "",
|
||||
loading: true,
|
||||
modelLoadingProgress: 0,
|
||||
modelPredictionProgress: 0,
|
||||
mlPredictionsLoading: false,
|
||||
mlLoading: false,
|
||||
mlModel: null,
|
||||
analyzedContentSpans: []
|
||||
}
|
||||
},
|
||||
mounted() {
|
||||
|
||||
if (this.$store.getters.optMlDefaultModel) {
|
||||
this.mlModel = this.$store.getters.optMlDefaultModel
|
||||
} else {
|
||||
this.mlModel = ModelsRepo.getDefaultModel();
|
||||
}
|
||||
|
||||
const query = Sist2Query.searchQuery();
|
||||
|
||||
if (this.$store.state.optHighlight) {
|
||||
|
||||
const fields = this.$store.state.fuzzy
|
||||
? {"content.nGram": {}}
|
||||
: {content: {}};
|
||||
@ -67,14 +115,28 @@ export default {
|
||||
this.loading = false;
|
||||
if (resp.hits.hits.length === 1) {
|
||||
this.content = this.getContent(resp.hits.hits[0]);
|
||||
} else {
|
||||
console.log("FIXME: could not get content")
|
||||
console.log(resp)
|
||||
}
|
||||
|
||||
if (this.optAutoAnalyze) {
|
||||
this.mlAnalyze();
|
||||
}
|
||||
});
|
||||
},
|
||||
computed: {
|
||||
...mapGetters(["optAutoAnalyze"]),
|
||||
modelSize() {
|
||||
const modelData = ModelsRepo.data[this.mlModel];
|
||||
if (!modelData) {
|
||||
return 0;
|
||||
}
|
||||
return modelData.size;
|
||||
}
|
||||
},
|
||||
methods: {
|
||||
...mapMutations(["setOptAutoAnalyze"]),
|
||||
getContent(doc) {
|
||||
this.rawContent = doc._source.content;
|
||||
|
||||
if (!doc.highlight) {
|
||||
return doc._source.content;
|
||||
}
|
||||
@ -85,10 +147,60 @@ export default {
|
||||
if (doc.highlight.content) {
|
||||
return doc.highlight.content[0];
|
||||
}
|
||||
},
|
||||
async getMlModel() {
|
||||
if (this.$store.getters.mlModel.name !== this.mlModel) {
|
||||
this.mlLoading = true;
|
||||
this.modelLoadingProgress = 0;
|
||||
const modelInfo = ModelsRepo.data[this.mlModel];
|
||||
|
||||
const model = new BertNerModel(
|
||||
modelInfo.vocabUrl,
|
||||
modelInfo.modelUrl,
|
||||
modelInfo.id2label,
|
||||
)
|
||||
|
||||
await model.init(progress => this.modelLoadingProgress = progress);
|
||||
this.$store.commit("setMlModel", {model, name: this.mlModel});
|
||||
|
||||
this.mlLoading = false;
|
||||
return model
|
||||
}
|
||||
|
||||
return this.$store.getters.mlModel.model;
|
||||
},
|
||||
async mlAnalyze() {
|
||||
if (!this.content) {
|
||||
return;
|
||||
}
|
||||
|
||||
const modelInfo = ModelsRepo.data[this.mlModel];
|
||||
if (modelInfo === undefined) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.$store.commit("setOptMlDefaultModel", this.mlModel);
|
||||
await this.$store.dispatch("updateConfiguration");
|
||||
|
||||
const model = await this.getMlModel();
|
||||
|
||||
this.analyzedContentSpans = [];
|
||||
|
||||
this.mlPredictionsLoading = true;
|
||||
|
||||
await model.predict(this.rawContent, results => {
|
||||
results.forEach(result => result.label = modelInfo.humanLabels[result.label]);
|
||||
this.analyzedContentSpans.push(...results);
|
||||
this.modelPredictionProgress = results[results.length - 1].wordIndex;
|
||||
});
|
||||
this.mlPredictionsLoading = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
<style>
|
||||
.progress-bar {
|
||||
transition: none;
|
||||
}
|
||||
</style>
|
@ -49,6 +49,7 @@ export default {
|
||||
configReset: "Reset configuration",
|
||||
searchOptions: "Search options",
|
||||
treemapOptions: "Treemap options",
|
||||
mlOptions: "Machine learning options",
|
||||
displayOptions: "Display options",
|
||||
opt: {
|
||||
lang: "Language",
|
||||
@ -78,7 +79,10 @@ export default {
|
||||
simpleLightbox: "Disable animations in image viewer",
|
||||
showTagPickerFilter: "Display the tag filter bar",
|
||||
featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
|
||||
featuredFieldsList: "Available variables"
|
||||
featuredFieldsList: "Available variables",
|
||||
autoAnalyze: "Automatically analyze text",
|
||||
defaultModel: "Default model",
|
||||
mlRepositories: "Model repositories (one per line)"
|
||||
},
|
||||
queryMode: {
|
||||
simple: "Simple",
|
||||
@ -171,6 +175,12 @@ export default {
|
||||
selectedIndex: "selected index",
|
||||
selectedIndices: "selected indices",
|
||||
},
|
||||
ml: {
|
||||
analyzeText: "Analyze",
|
||||
auto: "Auto",
|
||||
repoFetchError: "Failed to get list of models. Check browser console for more details.",
|
||||
repoFetchErrorTitle: "Could not fetch model repositories",
|
||||
}
|
||||
},
|
||||
de: {
|
||||
filePage: {
|
||||
|
77
sist2-vue/src/ml/BertNerModel.js
Normal file
77
sist2-vue/src/ml/BertNerModel.js
Normal file
@ -0,0 +1,77 @@
|
||||
import BertTokenizer from "@/ml/BertTokenizer";
|
||||
import * as tf from "@tensorflow/tfjs";
|
||||
import axios from "axios";
|
||||
|
||||
export default class BertNerModel {
|
||||
vocabUrl;
|
||||
modelUrl;
|
||||
|
||||
id2label;
|
||||
_tokenizer;
|
||||
_model;
|
||||
inputSize = 128;
|
||||
|
||||
_previousWordId = null;
|
||||
|
||||
constructor(vocabUrl, modelUrl, id2label) {
|
||||
this.vocabUrl = vocabUrl;
|
||||
this.modelUrl = modelUrl;
|
||||
this.id2label = id2label;
|
||||
}
|
||||
|
||||
async init(onProgress) {
|
||||
await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
|
||||
}
|
||||
|
||||
async loadTokenizer() {
|
||||
const vocab = (await axios.get(this.vocabUrl)).data;
|
||||
this._tokenizer = new BertTokenizer(vocab);
|
||||
}
|
||||
|
||||
async loadModel(onProgress) {
|
||||
this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
|
||||
}
|
||||
|
||||
alignLabels(labels, wordIds, words) {
|
||||
const result = [];
|
||||
|
||||
for (let i = 0; i < this.inputSize; i++) {
|
||||
const label = labels[i];
|
||||
const wordId = wordIds[i];
|
||||
|
||||
if (wordId === -1) {
|
||||
continue;
|
||||
}
|
||||
if (wordId === this._previousWordId) {
|
||||
continue;
|
||||
}
|
||||
|
||||
result.push({
|
||||
word: words[wordId].text, wordIndex: words[wordId].index, label: label
|
||||
});
|
||||
this._previousWordId = wordId;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
async predict(text, callback) {
|
||||
this._previousWordId = null;
|
||||
const encoded = this._tokenizer.encodeText(text, this.inputSize)
|
||||
|
||||
for (let chunk of encoded.inputChunks) {
|
||||
const rawResult = tf.tidy(() => this._model.execute({
|
||||
input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
|
||||
token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
|
||||
attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
|
||||
}));
|
||||
|
||||
const labelIds = await tf.argMax(rawResult, -1);
|
||||
const labelIdsArray = await labelIds.array();
|
||||
const labels = labelIdsArray[0].map(id => this.id2label[id]);
|
||||
rawResult.dispose()
|
||||
|
||||
callback(this.alignLabels(labels, chunk.wordIds, encoded.words))
|
||||
}
|
||||
}
|
||||
}
|
184
sist2-vue/src/ml/BertTokenizer.js
Normal file
184
sist2-vue/src/ml/BertTokenizer.js
Normal file
@ -0,0 +1,184 @@
|
||||
import {zip, chunk} from "underscore";
|
||||
|
||||
const UNK_INDEX = 100;
|
||||
const CLS_INDEX = 101;
|
||||
const SEP_INDEX = 102;
|
||||
const CONTINUING_SUBWORD_PREFIX = "##";
|
||||
|
||||
function isWhitespace(ch) {
|
||||
return /\s/.test(ch);
|
||||
}
|
||||
|
||||
function isInvalid(ch) {
|
||||
return (ch.charCodeAt(0) === 0 || ch.charCodeAt(0) === 0xfffd);
|
||||
}
|
||||
|
||||
const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';
|
||||
|
||||
/** To judge whether it's a punctuation. */
|
||||
function isPunctuation(ch) {
|
||||
return punctuations.indexOf(ch) !== -1;
|
||||
}
|
||||
|
||||
export default class BertTokenizer {
|
||||
vocab;
|
||||
|
||||
constructor(vocab) {
|
||||
this.vocab = vocab;
|
||||
}
|
||||
|
||||
tokenize(text) {
|
||||
const charOriginalIndex = [];
|
||||
const cleanedText = this.cleanText(text, charOriginalIndex);
|
||||
const origTokens = cleanedText.split(' ');
|
||||
|
||||
let charCount = 0;
|
||||
const tokens = origTokens.map((token) => {
|
||||
token = token.toLowerCase();
|
||||
const tokens = this.runSplitOnPunctuation(token, charCount, charOriginalIndex);
|
||||
charCount += token.length + 1;
|
||||
return tokens;
|
||||
});
|
||||
|
||||
let flattenTokens = [];
|
||||
for (let index = 0; index < tokens.length; index++) {
|
||||
flattenTokens = flattenTokens.concat(tokens[index]);
|
||||
}
|
||||
return flattenTokens;
|
||||
}
|
||||
|
||||
/* Performs invalid character removal and whitespace cleanup on text. */
|
||||
cleanText(text, charOriginalIndex) {
|
||||
text = text.replace(/\?/g, "").trim();
|
||||
|
||||
const stringBuilder = [];
|
||||
let originalCharIndex = 0;
|
||||
let newCharIndex = 0;
|
||||
|
||||
for (const ch of text) {
|
||||
// Skip the characters that cannot be used.
|
||||
if (isInvalid(ch)) {
|
||||
originalCharIndex += ch.length;
|
||||
continue;
|
||||
}
|
||||
if (isWhitespace(ch)) {
|
||||
if (stringBuilder.length > 0 && stringBuilder[stringBuilder.length - 1] !== ' ') {
|
||||
stringBuilder.push(' ');
|
||||
charOriginalIndex[newCharIndex] = originalCharIndex;
|
||||
originalCharIndex += ch.length;
|
||||
} else {
|
||||
originalCharIndex += ch.length;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
stringBuilder.push(ch);
|
||||
charOriginalIndex[newCharIndex] = originalCharIndex;
|
||||
originalCharIndex += ch.length;
|
||||
}
|
||||
newCharIndex++;
|
||||
}
|
||||
return stringBuilder.join('');
|
||||
}
|
||||
|
||||
/* Splits punctuation on a piece of text. */
|
||||
runSplitOnPunctuation(text, count, charOriginalIndex) {
|
||||
const tokens = [];
|
||||
let startNewWord = true;
|
||||
for (const ch of text) {
|
||||
if (isPunctuation(ch)) {
|
||||
tokens.push({text: ch, index: charOriginalIndex[count]});
|
||||
count += ch.length;
|
||||
startNewWord = true;
|
||||
} else {
|
||||
if (startNewWord) {
|
||||
tokens.push({text: '', index: charOriginalIndex[count]});
|
||||
startNewWord = false;
|
||||
}
|
||||
tokens[tokens.length - 1].text += ch;
|
||||
count += ch.length;
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
encode(words) {
|
||||
let outputTokens = [];
|
||||
const wordIds = [];
|
||||
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
let chars = [...words[i].text];
|
||||
|
||||
let isUnknown = false;
|
||||
let start = 0;
|
||||
let subTokens = [];
|
||||
|
||||
while (start < chars.length) {
|
||||
let end = chars.length;
|
||||
let currentSubstring = null;
|
||||
while (start < end) {
|
||||
let substr = chars.slice(start, end).join('');
|
||||
|
||||
if (start > 0) {
|
||||
substr = CONTINUING_SUBWORD_PREFIX + substr;
|
||||
}
|
||||
if (this.vocab.includes(substr)) {
|
||||
currentSubstring = this.vocab.indexOf(substr);
|
||||
break;
|
||||
}
|
||||
|
||||
--end;
|
||||
}
|
||||
if (currentSubstring == null) {
|
||||
isUnknown = true;
|
||||
break;
|
||||
}
|
||||
subTokens.push(currentSubstring);
|
||||
start = end;
|
||||
}
|
||||
|
||||
if (isUnknown) {
|
||||
outputTokens.push(UNK_INDEX);
|
||||
wordIds.push(i);
|
||||
} else {
|
||||
subTokens.forEach(tok => {
|
||||
outputTokens.push(tok);
|
||||
wordIds.push(i)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return {tokens: outputTokens, wordIds};
|
||||
}
|
||||
|
||||
encodeText(inputText, inputSize) {
|
||||
|
||||
const tokenized = this.tokenize(inputText);
|
||||
const encoded = this.encode(tokenized);
|
||||
|
||||
const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
|
||||
const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);
|
||||
|
||||
const chunks = [];
|
||||
|
||||
zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, wordIds]) => {
|
||||
const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
|
||||
const segmentIds = Array(inputIds.length).fill(0);
|
||||
const inputMask = Array(inputIds.length).fill(1);
|
||||
wordIds = [-1, ...wordIds, -1];
|
||||
|
||||
while (inputIds.length < inputSize) {
|
||||
inputIds.push(0);
|
||||
inputMask.push(0);
|
||||
segmentIds.push(0);
|
||||
wordIds.push(-1);
|
||||
}
|
||||
|
||||
chunks.push({inputIds, inputMask, segmentIds, wordIds})
|
||||
});
|
||||
|
||||
return {
|
||||
inputChunks: chunks,
|
||||
words: tokenized
|
||||
};
|
||||
}
|
||||
}
|
43
sist2-vue/src/ml/modelsRepo.js
Normal file
43
sist2-vue/src/ml/modelsRepo.js
Normal file
@ -0,0 +1,43 @@
|
||||
import axios from "axios";
|
||||
|
||||
class ModelsRepo {
|
||||
_repositories;
|
||||
data = {};
|
||||
|
||||
async init(repositories) {
|
||||
this._repositories = repositories;
|
||||
|
||||
const data = await Promise.all(this._repositories.map(this._loadRepository));
|
||||
|
||||
data.forEach(models => {
|
||||
models.forEach(model => {
|
||||
this.data[model.name] = model;
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
async _loadRepository(repository) {
|
||||
const data = (await axios.get(repository)).data;
|
||||
data.forEach(model => {
|
||||
model["modelUrl"] = new URL(model["modelPath"], repository).href;
|
||||
model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
|
||||
});
|
||||
return data;
|
||||
}
|
||||
|
||||
getOptions() {
|
||||
return Object.values(this.data).map(model => ({
|
||||
text: `${model.name} (${Math.round(model.size / (1024*1024))}MB)`,
|
||||
value: model.name
|
||||
}));
|
||||
}
|
||||
|
||||
getDefaultModel() {
|
||||
if (Object.values(this.data).length === 0) {
|
||||
return null;
|
||||
}
|
||||
return Object.values(this.data).find(model => model.default).name;
|
||||
}
|
||||
}
|
||||
|
||||
export default new ModelsRepo();
|
@ -57,6 +57,9 @@ export default new Vuex.Store({
|
||||
optVidPreviewInterval: 700,
|
||||
optSimpleLightbox: true,
|
||||
optShowTagPickerFilter: true,
|
||||
optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
|
||||
optAutoAnalyze: false,
|
||||
optMlDefaultModel: null,
|
||||
|
||||
_onLoadSelectedIndices: [] as string[],
|
||||
_onLoadSelectedMimeTypes: [] as string[],
|
||||
@ -86,7 +89,11 @@ export default new Vuex.Store({
|
||||
|
||||
uiMimeMap: [] as any[],
|
||||
|
||||
auth0Token: null
|
||||
auth0Token: null,
|
||||
mlModel: {
|
||||
model: null,
|
||||
name: null
|
||||
},
|
||||
},
|
||||
mutations: {
|
||||
setUiShowDetails: (state, val) => state.uiShowDetails = val,
|
||||
@ -172,6 +179,9 @@ export default new Vuex.Store({
|
||||
setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
|
||||
setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
|
||||
setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
|
||||
setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
|
||||
setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
|
||||
setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},
|
||||
|
||||
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
|
||||
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
|
||||
@ -194,6 +204,7 @@ export default new Vuex.Store({
|
||||
// noop
|
||||
},
|
||||
setAuth0Token: (state, val) => state.auth0Token = val,
|
||||
setMlModel: (state, val) => state.mlModel = val,
|
||||
},
|
||||
actions: {
|
||||
setSist2Info: (store, val) => {
|
||||
@ -350,6 +361,7 @@ export default new Vuex.Store({
|
||||
},
|
||||
modules: {},
|
||||
getters: {
|
||||
mlModel: (state) => state.mlModel,
|
||||
seed: (state) => state.seed,
|
||||
getPathText: (state) => state.pathText,
|
||||
indices: state => state.indices,
|
||||
@ -416,5 +428,12 @@ export default new Vuex.Store({
|
||||
optSimpleLightbox: state => state.optSimpleLightbox,
|
||||
optShowTagPickerFilter: state => state.optShowTagPickerFilter,
|
||||
optFeaturedFields: state => state.optFeaturedFields,
|
||||
optMlRepositories: state => state.optMlRepositories,
|
||||
mlRepositoryList: state => {
|
||||
const repos = state.optMlRepositories.split("\n")
|
||||
return repos[0] == "" ? [] : repos;
|
||||
},
|
||||
optMlDefaultModel: state => state.optMlDefaultModel,
|
||||
optAutoAnalyze: state => state.optAutoAnalyze,
|
||||
}
|
||||
})
|
@ -25,7 +25,8 @@
|
||||
<b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>
|
||||
|
||||
<label>{{ $t("opt.displayMode") }}</label>
|
||||
<b-form-select :options="displayModeOptions" :value="optDisplay" @input="setOptDisplay"></b-form-select>
|
||||
<b-form-select :options="displayModeOptions" :value="optDisplay"
|
||||
@input="setOptDisplay"></b-form-select>
|
||||
|
||||
<label>{{ $t("opt.columns") }}</label>
|
||||
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
|
||||
@ -123,7 +124,10 @@
|
||||
}}
|
||||
</b-form-checkbox>
|
||||
|
||||
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
|
||||
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{
|
||||
$t("opt.highlight")
|
||||
}}
|
||||
</b-form-checkbox>
|
||||
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
|
||||
$t("opt.tagOrOperator")
|
||||
}}
|
||||
@ -148,7 +152,8 @@
|
||||
@input="setOptResultSize"></b-form-input>
|
||||
|
||||
<label>{{ $t("opt.queryMode") }}</label>
|
||||
<b-form-select :options="queryModeOptions" :value="optQueryMode" @input="setOptQueryMode"></b-form-select>
|
||||
<b-form-select :options="queryModeOptions" :value="optQueryMode"
|
||||
@input="setOptQueryMode"></b-form-select>
|
||||
|
||||
<label>{{ $t("opt.slideDuration") }}</label>
|
||||
<b-form-input :value="optLightboxSlideDuration" type="number" min="1"
|
||||
@ -159,6 +164,17 @@
|
||||
@input="setOptVidPreviewInterval"></b-form-input>
|
||||
</b-card>
|
||||
|
||||
<h4 class="mt-3">{{ $t("mlOptions") }}</h4>
|
||||
<b-card>
|
||||
<label>{{ $t("opt.mlRepositories") }}</label>
|
||||
<b-textarea rows="3" :value="optMlRepositories" @input="setOptMlRepositories"></b-textarea>
|
||||
<br>
|
||||
<b-form-checkbox :checked="optAutoAnalyze" @input="setOptAutoAnalyze">{{
|
||||
$t("opt.autoAnalyze")
|
||||
}}
|
||||
</b-form-checkbox>
|
||||
</b-card>
|
||||
|
||||
<h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
|
||||
<b-card>
|
||||
<label>{{ $t("opt.treemapType") }}</label>
|
||||
@ -311,6 +327,8 @@ export default {
|
||||
"optSimpleLightbox",
|
||||
"optShowTagPickerFilter",
|
||||
"optFeaturedFields",
|
||||
"optMlRepositories",
|
||||
"optAutoAnalyze",
|
||||
]),
|
||||
clientWidth() {
|
||||
return window.innerWidth;
|
||||
@ -355,6 +373,8 @@ export default {
|
||||
"setOptSimpleLightbox",
|
||||
"setOptShowTagPickerFilter",
|
||||
"setOptFeaturedFields",
|
||||
"setOptMlRepositories",
|
||||
"setOptAutoAnalyze",
|
||||
]),
|
||||
onResetClick() {
|
||||
localStorage.removeItem("sist2_configuration");
|
||||
|
@ -51,11 +51,11 @@
|
||||
#include <ctype.h>
|
||||
#include "git_hash.h"
|
||||
|
||||
#define VERSION "3.0.3"
|
||||
#define VERSION "3.0.4"
|
||||
static const char *const Version = VERSION;
|
||||
static const int VersionMajor = 3;
|
||||
static const int VersionMinor = 0;
|
||||
static const int VersionPatch = 3;
|
||||
static const int VersionPatch = 4;
|
||||
|
||||
#ifndef SIST_PLATFORM
|
||||
#define SIST_PLATFORM unknown
|
||||
|
Loading…
x
Reference in New Issue
Block a user