Mirror of https://github.com/simon987/sist2.git (synced 2025-12-13 07:19:06 +00:00)

Compare commits: process-po...3.0.5 (31 commits)
| SHA1 |
|---|
| 2d8685f8f5 |
| c930ef7840 |
| d32bda0d68 |
| 499ed0be79 |
| dc39c0ec4b |
| b5cdd9a5df |
| a8b6886f7b |
| a7e9b6af96 |
| 0710dc6d3d |
| 75b66b5982 |
| 9813646c11 |
| ebc9468251 |
| 7baaca5078 |
| 6c4bdc87cf |
| 1ea78887c3 |
| 886fa720ec |
| d43aac735f |
| faf438a798 |
| 5b3b9911bd |
| 237d55ec9c |
| ced4c7de88 |
| 90ee318981 |
| 785121e46c |
| 585c57a2ad |
| 42abbbce95 |
| e8607df26f |
| f1726ca0a9 |
| 3ef675abcf |
| 81658efb19 |
| 60c77678b4 |
| bf1d2f7d55 |
.gitattributes (vendored, 3 changes)
@@ -1,3 +0,0 @@
-CMakeModules/* linguist-vendored
-**/*_generated.c linguist-vendored
-**/*_generated.h linguist-vendored
CMakeLists.txt

@@ -5,6 +5,7 @@ set(CMAKE_C_STANDARD 11)

 option(SIST_DEBUG "Build a debug executable" on)
 option(SIST_FAST "Enable more optimisation flags" off)
+option(SIST_DEBUG_INFO "Turn on debug information in web interface" on)

 add_compile_definitions(
         "SIST_PLATFORM=${SIST_PLATFORM}"

@@ -14,8 +15,18 @@ if (SIST_DEBUG)
     add_compile_definitions(
             "SIST_DEBUG=${SIST_DEBUG}"
     )
+    set(VCPKG_BUILD_TYPE debug)
+else ()
+    set(VCPKG_BUILD_TYPE release)
 endif ()

+if (SIST_DEBUG_INFO)
+    add_compile_definitions(
+            "SIST_DEBUG_INFO=${SIST_DEBUG_INFO}"
+    )
+endif ()
+
+
 add_subdirectory(third-party/libscan)
 set(ARGPARSE_SHARED off)
 add_subdirectory(third-party/argparse)

@@ -47,7 +58,7 @@ add_executable(sist2

         src/auth0/auth0_c_api.h src/auth0/auth0_c_api.cpp

-        src/database/database_stats.c src/database/database_stats.h src/database/database_schema.c)
+        src/database/database_stats.c src/database/database_schema.c)

 set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)

 target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
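For reference, a local configure-and-build sketch using the new `SIST_DEBUG_INFO` option added above. This is a sketch only; the `/vcpkg` path and the `x64_linux` platform value are taken from the project's Dockerfile and may differ on your machine:

```bash
# Sketch: configure with the new SIST_DEBUG_INFO flag.
# Assumes vcpkg is checked out at /vcpkg, as in the Dockerfile.
mkdir -p build && cd build
cmake -DSIST_PLATFORM=x64_linux \
      -DSIST_DEBUG_INFO=on \
      -DSIST_DEBUG=off \
      -DBUILD_TESTS=off \
      -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
make -j"$(nproc)"
```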
Dockerfile (30 changes)
@@ -19,13 +19,12 @@ COPY sist2-admin sist2-admin
 RUN cd sist2-vue/ && npm install && npm run build
 RUN cd sist2-admin/frontend/ && npm install && npm run build

-RUN mkdir build && cd build && cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+RUN mkdir build && cd build && cmake -DSIST_PLATFORM=x64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
 RUN cd build && make -j$(nproc)
 RUN strip build/sist2 || mv build/sist2_debug build/sist2

 FROM --platform="linux/amd64" ubuntu@sha256:965fbcae990b0467ed5657caceaec165018ef44a4d2d46c7cdea80a9dff0d1ea

 WORKDIR /root

 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

@@ -37,21 +36,22 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y curl libasan5 li
 RUN mkdir -p /usr/share/tessdata && \
     cd /usr/share/tessdata/ && \
-    curl -o /usr/share/tessdata/hin.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/hin.traineddata &&\
-    curl -o /usr/share/tessdata/jpn.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/jpn.traineddata &&\
-    curl -o /usr/share/tessdata/eng.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/eng.traineddata &&\
-    curl -o /usr/share/tessdata/fra.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/fra.traineddata &&\
-    curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
-    curl -o /usr/share/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
-    curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
-    curl -o /usr/share/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
-    curl -o /usr/share/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
-    curl -o /usr/share/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/hin.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/hin.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/jpn.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/jpn.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/eng.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/fra.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/fra.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/osd.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/osd.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/deu.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/deu.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/equ.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/equ.traineddata &&\
+    curl -o /usr/share/tesseract-ocr/4.00/tessdata/chi_sim.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/chi_sim.traineddata

 # sist2
 COPY --from=build /build/build/sist2 /root/sist2

 # sist2-admin
-COPY sist2-admin/requirements.txt sist2-admin/
-RUN python3 -m pip install --no-cache -r sist2-admin/requirements.txt
-COPY --from=build /build/sist2-admin/ sist2-admin/
 WORKDIR /root/sist2-admin
+COPY sist2-admin/requirements.txt /root/sist2-admin/
+RUN python3 -m pip install --no-cache -r /root/sist2-admin/requirements.txt
+COPY --from=build /build/sist2-admin/ /root/sist2-admin/
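The tessdata block above moves the language files to the path where the distribution's tesseract 4 packages presumably look for them. A hedged check after building the image (the image tag is the placeholder from the README build example):

```bash
# Sketch: list the OCR language files baked into the image.
docker run --rm --entrypoint ls my-sist2-image /usr/share/tesseract-ocr/4.00/tessdata/
```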
@@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>

 WORKDIR /build/
 ADD . /build/
-RUN mkdir build && cd build && cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
+RUN mkdir build && cd build && cmake -DSIST_PLATFORM=arm64_linux_docker -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake ..
 RUN cd build && make -j$(nproc)
 RUN strip build/sist2 || mv build/sist2_debug build/sist2
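Assuming this second Dockerfile is the arm64 variant (the file name is not shown in this diff view), a cross-build sketch with buildx; the `Dockerfile.arm64` name and image tag are assumptions:

```bash
# Sketch: cross-build the arm64 image; adjust -f to the repo's actual file name.
docker buildx build --platform linux/arm64 -f Dockerfile.arm64 -t my-sist2-arm64 .
```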
README.md (108 changes)
@@ -10,13 +10,13 @@ sist2 (Simple incremental search tool)

 *Warning: sist2 is in early development*

-![sist2.png](docs/sist2.png)
+![sist2.gif](docs/sist2.gif)

 ## Features

 * Fast, low memory usage, multi-threaded
+* Manage & schedule scan jobs with simple web interface (Docker only)
 * Mobile-friendly Web interface
 * Portable (all its features are packaged in a single executable)
 * Extracts text and metadata from common file types \*
 * Generates thumbnails \*
 * Incremental scanning
@@ -24,47 +24,60 @@ sist2 (Simple incremental search tool)
 * Recursive scan inside archive files \*\*
 * OCR support with tesseract \*\*\*
 * Stats page & disk utilisation visualization
+* Named-entity recognition (client-side) \*\*\*\*

 \* See [format support](#format-support)
 \*\* See [Archive files](#archive-files)
 \*\*\* See [OCR](#ocr)
-
-![stats](docs/stats.png)
+\*\*\*\* See [Named-Entity Recognition](#NER)

 ## Getting Started

+### Using Docker Compose *(Windows/Linux/Mac)*
+
+```yaml
+version: "3"
+
+services:
+  elasticsearch:
+    image: elasticsearch:7.17.9
+    restart: unless-stopped
+    environment:
+      - "discovery.type=single-node"
+      - "ES_JAVA_OPTS=-Xms2g -Xmx2g"
+  sist2-admin:
+    image: simon987/sist2:3.0.4-x64-linux
+    restart: unless-stopped
+    volumes:
+      - ./sist2-admin-data/:/sist2-admin/
+      - /:/host
+    ports:
+      - 4090:4090 # sist2
+      - 8080:8080 # sist2-admin
+    working_dir: /root/sist2-admin/
+    entrypoint: python3 /root/sist2-admin/sist2_admin/app.py
+```
+
+Navigate to http://localhost:8080/ to configure sist2-admin.
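To try the new compose file above, save it as `docker-compose.yml` and start the stack (a sketch; the flags are standard Docker Compose):

```bash
docker compose up -d   # or: docker-compose up -d on older installs
```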
 ### Using the executable file *(Linux/WSL only)*

 1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
    1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
-   1. *(or)* Run using docker:
+   2. *(or)* Run using docker:
       ```bash
       docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.17.9
       ```
-   1. *(or)* Run using docker-compose:
-      ```yaml
-      elasticsearch:
-        image: docker.elastic.co/elasticsearch/elasticsearch:7.17.9
-        environment:
-          - discovery.type=single-node
-          - "ES_JAVA_OPTS=-Xms1G -Xmx2G"
-      ```
-1. Download sist2 executable
-   1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
-      Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x` *
-   2. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
-      recommended!)*
-   3. *(or)* `docker pull simon987/sist2:2.12.1-x64-linux`
-
-1. See [Usage guide](docs/USAGE.md)
+2. Download the [latest sist2 release](https://github.com/simon987/sist2/releases).
+   Select the file corresponding to your CPU architecture and mark the binary as executable with `chmod +x`.
+3. See [usage guide](docs/USAGE.md) for command line usage.

-\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
-
-Example usage:
+## Example usage

 See [Usage guide](docs/USAGE.md) for more details

-1. Scan a directory: `sist2 scan ~/Documents -o ./docs_idx`
-1. Push index to Elasticsearch: `sist2 index ./docs_idx`
-1. Start web interface: `sist2 web ./docs_idx`
+1. Scan a directory: `sist2 scan ~/Documents --output ./documents.sist2`
+2. Push index to Elasticsearch: `sist2 index ./documents.sist2`
+3. Start web interface: `sist2 web ./documents.sist2`
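Put together, the three steps above make a minimal end-to-end run. Treat this as a sketch, not canonical; the `--force-reset` note follows the old USAGE.md help text ("You must use this option the first time you use the index command"):

```bash
sist2 scan ~/Documents --output ./documents.sist2
sist2 index --force-reset ./documents.sist2   # --force-reset on first run, per USAGE.md
sist2 web ./documents.sist2                   # listens on localhost:4090 by default
```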
 ## Format support

@@ -82,7 +95,7 @@ See [Usage guide](docs/USAGE.md) for more details
 | tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
 | docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
 | doc (MS Word 97-2003) | antiword | yes | no | author, title |
-| mobi, azw, azw3 | libmobi | yes | no | author, title |
+| mobi, azw, azw3 | libmobi | yes | yes | author, title |
 | wpd (WordPerfect) | libwpd | yes | no | *planned* |
 | json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
@@ -123,20 +136,44 @@ sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
 sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
 ```

+### NER
+
+sist2 v3.0.4+ supports named-entity recognition (NER). Simply add a supported repository URL to
+**Configuration** > **Machine learning options** > **Model repositories**
+to enable it.
+
+The text processing is done in your browser; no data is sent to any third-party services.
+See [simon987/sist2-ner-models](https://github.com/simon987/sist2-ner-models) for more details.
+
+#### List of available repositories:
+
+| URL | Maintainer | Purpose |
+|-----|------------|---------|
+| [simon987/sist2-ner-models](https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json) | [simon987](https://github.com/simon987) | General |
+
+<details>
+  <summary>Screenshot</summary>
+
+  ![ner](docs/ner.png)
+
+</details>
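Before adding a repository in the UI, you can sanity-check that its manifest is reachable; the URL is the one from the table above:

```bash
# Sketch: fetch the model repository manifest that sist2 loads client-side.
curl -s https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json
```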
 ## Build from source

 You can compile **sist2** by yourself if you don't want to use the pre-compiled binaries.

-### With docker (recommended)
+### Using docker

 ```bash
 git clone --recursive https://github.com/simon987/sist2/
 cd sist2
-docker build . -f ./Dockerfile -t my-sist2-image
+docker build . -t my-sist2-image
 # Copy sist2 executable from docker image
 docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
 ```

-### On a linux computer
+### Using a linux computer

 1. Install compile-time dependencies
@@ -144,15 +181,14 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
    ```bash
    apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git nodejs
    ```

-1. Apply vcpkg patches, as per [sist2-build](https://github.com/simon987/sist2-build) Dockerfile
-
-1. Install vcpkg dependencies
+2. Install vcpkg using my fork: https://github.com/simon987/vcpkg
+3. Install vcpkg dependencies

    ```bash
    vcpkg install curl[core,openssl] sqlite3 cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample]
    ```

-1. Build
+4. Build
    ```bash
    git clone --recursive https://github.com/simon987/sist2/
    (cd sist2-vue; npm install; npm run build)
docs/USAGE.md (231 changes)
@@ -1,78 +1,64 @@
 # Usage

 *More examples (specifically with docker/compose) are in progress*

 * [scan](#scan)
     * [options](#scan-options)
     * [examples](#scan-examples)
     * [index format](#index-format)
 * [index](#index)
     * [options](#index-options)
     * [examples](#index-examples)
 * [web](#web)
     * [options](#web-options)
     * [examples](#web-examples)
     * [rewrite_url](#rewrite_url)
 * [elasticsearch](#elasticsearch)
 * [exec-script](#exec-script)
 * [tagging](#tagging)
 * [sidecar files](#sidecar-files)

 ```
 Usage: sist2 scan [OPTION]... PATH
    or: sist2 index [OPTION]... INDEX
    or: sist2 web [OPTION]... INDEX...
    or: sist2 exec-script [OPTION]... INDEX

 Lightning-fast file system indexer and search tool.

     -h, --help                    show this help message and exit
-    -v, --version                 Show version and exit
-    --verbose                     Turn on logging
-    --very-verbose                Turn on debug messages
+    -v, --version                 Print version and exit.
+    --verbose                     Turn on logging.
+    --very-verbose                Turn on debug messages.
+    --json-logs                   Output logs in JSON format.

 Scan options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
-    --mem-throttle=<int>          Total memory threshold in MiB for scan throttling. DEFAULT=0
-    -q, --thumbnail-quality=<int> Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT=2
-    --thumbnail-size=<int>        Thumbnail size, in pixels. DEFAULT=500
-    --thumbnail-count=<int>       Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT=1
-    --content-size=<int>          Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT=32768
-    --incremental=<str>           Reuse an existing index and only scan modified files.
-    -o, --output=<str>            Output directory. DEFAULT=index.sist2/
+    -t, --threads=<int>           Number of threads. DEFAULT: 1
+    -q, --thumbnail-quality=<int> Thumbnail quality, on a scale of 2 to 31, 2 being the best. DEFAULT: 2
+    --thumbnail-size=<int>        Thumbnail size, in pixels. DEFAULT: 552
+    --thumbnail-count=<int>       Number of thumbnails to generate. Set a value > 1 to create video previews, set to 0 to disable thumbnails. DEFAULT: 1
+    --content-size=<int>          Number of bytes to be extracted from text documents. Set to 0 to disable. DEFAULT: 32768
+    -o, --output=<str>            Output index file path. DEFAULT: index.sist2
+    --incremental                 If the output file path exists, only scan new or modified files.
+    --optimize-index              Defragment index file after scan to reduce its file size.
     --rewrite-url=<str>           Serve files from this url instead of from disk.
-    --name=<str>                  Index display name. DEFAULT: (name of the directory)
+    --name=<str>                  Index display name. DEFAULT: index
     --depth=<int>                 Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
-    --archive=<str>               Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
+    --archive=<str>               Archive file mode (skip|list|shallow|recurse). skip: don't scan, list: only save file names as text, shallow: don't scan archives inside archives. DEFAULT: recurse
     --archive-passphrase=<str>    Passphrase for encrypted archive files
     --ocr-lang=<str>              Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
     --ocr-images                  Enable OCR'ing of image files.
     --ocr-ebooks                  Enable OCR'ing of ebook files.
-    -e, --exclude=<str>           Files that match this regex will not be scanned
-    --fast                        Only index file names & mime type
+    -e, --exclude=<str>           Files that match this regex will not be scanned.
+    --fast                        Only index file names & mime type.
     --treemap-threshold=<str>     Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
     --mem-buffer=<int>            Maximum memory buffer size per thread in MiB for files inside archives (see USAGE.md). DEFAULT: 2000
     --read-subtitles              Read subtitles from media files.
-    --fast-epub                   Faster but less accurate EPUB parsing (no thumbnails, metadata)
+    --fast-epub                   Faster but less accurate EPUB parsing (no thumbnails, metadata).
+    --checksums                   Calculate file checksums when scanning.
+    --list-file=<str>             Specify a list of newline-delimited paths to be scanned instead of normal directory traversal. Use '-' to read from stdin.

 Index options
-    -t, --threads=<int>           Number of threads. DEFAULT=1
-    --es-url=<str>                Elasticsearch url with port. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
-    -p, --print                   Just print JSON documents to stdout.
-    --incremental-index           Conduct incremental indexing, assumes that the old index is already digested by Elasticsearch.
+    -t, --threads=<int>           Number of threads. DEFAULT: 1
+    --es-url=<str>                Elasticsearch url with port. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
+    -p, --print                   Print JSON documents to stdout instead of indexing to elasticsearch.
+    --incremental-index           Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
     --script-file=<str>           Path to user script.
     --mappings-file=<str>         Path to Elasticsearch mappings.
     --settings-file=<str>         Path to Elasticsearch settings.
     --async-script                Execute user script asynchronously.
-    --batch-size=<int>            Index batch size. DEFAULT: 100
-    -f, --force-reset             Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
+    --batch-size=<int>            Index batch size. DEFAULT: 70
+    -f, --force-reset             Reset Elasticsearch mappings and settings.

 Web options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
-    --bind=<str>                  Listen on this address. DEFAULT=localhost:4090
+    --es-url=<str>                Elasticsearch url. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
+    --bind=<str>                  Listen for connections on this address. DEFAULT: localhost:4090
     --auth=<str>                  Basic auth in user:password format
     --auth0-audience=<str>        API audience/identifier
     --auth0-domain=<str>          Application domain
@@ -84,77 +70,15 @@ Web options
     --lang=<str>                  Default UI language. Can be changed by the user

 Exec-script options
-    --es-url=<str>                Elasticsearch url. DEFAULT=http://localhost:9200
-    --es-index=<str>              Elasticsearch index name. DEFAULT=sist2
+    --es-url=<str>                Elasticsearch url. DEFAULT: http://localhost:9200
+    --es-insecure-ssl             Do not verify SSL connections to Elasticsearch.
+    --es-index=<str>              Elasticsearch index name. DEFAULT: sist2
     --script-file=<str>           Path to user script.
     --async-script                Execute user script asynchronously.

 Made by simon987 <me@simon987.net>. Released under GPL-3.0
 ```
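The updated help text above introduces several new flags (`--json-logs`, `--optimize-index`, `--list-file`, `--es-insecure-ssl`). A hedged combined example; the exact pairing of flags is illustrative, not from the docs:

```bash
# Sketch: combine some of the flags added in this release.
sist2 scan ~/Documents -o ./documents.sist2 --optimize-index --json-logs
sist2 index ./documents.sist2 --es-insecure-ssl --es-url https://localhost:9200
```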
 ## Scan

 ### Scan options

-* `-t, --threads`
-  Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
-* `--mem-throttle`
-  Total memory threshold in MiB for scan throttling. Worker threads will not start a new parse job
-  until the total memory usage of sist2 is below this threshold. Set to 0 to disable. DEFAULT=0
-* `-q, --thumbnail-quality`
-  Thumbnail quality, on a scale of 2 to 32, 2 being the best. See section below for a rough estimate of thumbnail database size
-* `--thumbnail-size`
-  Thumbnail size in pixels.
-* `--thumbnail-count`
-  Maximum number of thumbnails to generate. When set to a value >= 2, thumbnails for video previews
-  will be generated. The actual number of thumbnails generated depends on the length of the video (maximum 1 image
-  every ~7s). Set to 0 to completely disable thumbnails.
-* `--content-size`
-  Number of bytes of text to be extracted from the content of files (plain text, PDFs etc.).
-  Repeated whitespace and special characters do not count toward this limit.
-  Set to 0 to completely disable content parsing.
-* `--incremental`
-  Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
-  will be copied to the new index and will not be parsed again.
-* `-o, --output` Output directory.
-* `--rewrite-url` Set the `rewrite_url` option for the web module (See [rewrite_url](#rewrite_url))
-* `--name` Set the `name` option for the web module
-* `--depth` Maximum scan depth. Set to 0 to only scan files directly in the root directory, set to -1 for infinite depth
-* `--archive` Archive file mode.
-    * skip: Don't parse
-    * list: Only get file names as text
-    * shallow: Don't parse archives inside archives.
-    * recurse: Scan archives recursively (default)
-* `--ocr-lang`, `--ocr-ebooks`, `--ocr-images` See [OCR](../README.md#OCR)
-* `-e, --exclude` Regex pattern to exclude files. A file is excluded if the pattern matches any
-  part of the full absolute path.
-
-  Examples:
-    * `-e ".*\.ttf"`: Ignore ttf files
-    * `-e ".*\.(ttf|rar)"`: Ignore ttf and rar files
-    * `-e "^/mnt/backups/"`: Ignore all files in the `/mnt/backups/` directory
-    * `-e "^/mnt/Data[12]/"`: Ignore all files in the `/mnt/Data1/` and `/mnt/Data2/` directories
-    * `-e "(^/usr/)|(^/var/)|(^/media/DRIVE-A/tmp/)|(^/media/DRIVE-B/Trash/)"`: Exclude the
-      `/usr`, `/var`, `/media/DRIVE-A/tmp`, `/media/DRIVE-B/Trash` directories
-* `--fast` Only index file names and mime type
-* `--treemap-threshold` Directories smaller than (`treemap-threshold` * `<total size of the index>`)
-  will not be considered for the disk utilisation visualization; their size will be added to
-  the parent directory. If the parent directory is still smaller than the threshold, it will also be "merged upwards",
-  and so on.
-
-  In effect, smaller `treemap-threshold` values will yield a more detailed
-  (but also a more cluttered and harder to read) visualization.
-
-* `--mem-buffer` Maximum memory buffer size in MiB (per thread) for files inside archives. Media files
-  larger than this number will be read sequentially and no *seek* operations will be supported.
-
-  To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
-* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
-* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
-* `--checksums` Calculate file checksums (SHA1) when scanning files. This option does not cause any additional read
-  operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
-  files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
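Although the long-form explanations above were removed in this commit, the options themselves remain in the CLI help. A sketch combining two of them, built from the exclude-pattern examples in the removed text:

```bash
# Sketch: skip the backups directory and ttf/rar files, and compute SHA1 checksums.
sist2 scan /mnt/Data1 \
    -e "(^/mnt/backups/)|(.*\.(ttf|rar))" \
    --checksums \
    -o ./data1.sist2
```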
 #### Thumbnail database size estimation

 See the chart below for a rough estimate of thumbnail database size vs. the thumbnail size & quality arguments:

@@ -164,8 +88,6 @@ that is about `8000000 * 36kB = 288GB`.

 ![thumbnail_size](docs/thumbnail_size.png)

-// TODO: add note about LMDB page size 4096
-
 ### Scan examples

 Simple scan
@@ -175,82 +97,19 @@ sist2 scan ~/Documents
 sist2 scan \
     --threads 4 --content-size 16000000 --thumbnail-quality 2 --archive shallow \
     --name "My Documents" --rewrite-url "http://nas.domain.local/My Documents/" \
-    ~/Documents -o ./documents.idx/
+    ~/Documents -o ./documents.sist2
 ```

 Incremental scan

+If the index file does not exist, `--incremental` has no effect.
 ```bash
+sist2 scan ~/Documents -o ./documents.sist2
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+# or
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+sist2 scan ~/Documents -o ./documents.sist2 --incremental
+```
-sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
-```
-### Index format
-
-A typical `ndjson` type index structure looks like this:
-```
-documents.idx/
-├── descriptor.json
-├── _index_main.ndjson.zst
-├── treemap.csv
-├── agg_mime.csv
-├── agg_date.csv
-├── agg_size.csv
-├── thumbs/
-|   ├── data.mdb
-|   └── lock.mdb
-├── tags/
-|   ├── data.mdb
-|   └── lock.mdb
-└── meta/
-    ├── data.mdb
-    └── lock.mdb
-```
-
-The `_index_*.ndjson.zst` files contain the document data in JSON format, in a compressed newline-delimited file.
-
-The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
-database containing the thumbnails.
-
-The `descriptor.json` file contains general information about the index. The
-following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rewrite_url) and `timestamp`.
-
-The `.csv` files are pre-computed aggregations necessary for the stats page.
-
-*thumbs/*:
-
-LMDB key-value store. Keys are **binary** 16-byte md5 hashes* (`_id` field)
-and values are raw image bytes.
-
-*\* The hash is calculated from the full path of the file, including the extension, relative to the index root*

 ## Index
 ### Index options

-* `--es-url`
-  Elasticsearch url and port. If you are using docker, make sure that both containers are on the
-  same network.
-* `--es-index`
-  Elasticsearch index name. DEFAULT=sist2
-* `-p, --print`
-  Print index in JSON format to stdout.
-* `--incremental-index`
-  Conduct incremental indexing. Assumes that the old index is already ingested in Elasticsearch.
-  Only the new changes since the last scan will be sent.
-* `--script-file`
-  Path to user script. See [Scripting](scripting.md).
-* `--mappings-file`
-  Path to custom Elasticsearch mappings. If none is specified, [the bundled mappings](https://github.com/simon987/sist2/tree/master/schema) will be used.
-* `--settings-file`
-  Path to custom Elasticsearch settings. *(See above)*
-* `--async-script`
-  Use the `wait_for_completion=false` elasticsearch option while executing the user script.
-  (See [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/tasks.html))
-* `--batch-size=<int>`
-  Index batch size. Indexing is generally faster with larger batches, but payloads that
-  are too large will fail and the additional overhead of retrying with smaller sizes may slow
-  down the process.
-* `-f, --force-reset`
-  Reset Elasticsearch mappings and settings.
-* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.

 ### Index examples
@@ -380,8 +239,8 @@ The sidecar file must have exactly the same file path and the `.s2meta` suffix.
 ```

 ```
-sist2 scan ~/Documents -o ./docs.idx
-sist2 index ./docs.idx
+sist2 scan ~/Documents -o ./docs.sist2
+sist2 index ./docs.sist2
 ```

 *NOTE*: It is technically possible to overwrite the `tag` value using sidecar files, however,
BIN docs/ner.png (new file; 448 KiB)
BIN docs/sist2.gif (new file; 3.7 MiB)
BIN docs/sist2.png (deleted; was 1011 KiB)
@@ -4,14 +4,20 @@ VCPKG_ROOT="/vcpkg"

 git submodule update --init --recursive

-rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+mkdir build
+(
+  cd build
+  cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
   ./sist2 -v > VERSION
-mv sist2 sist2-x64-linux
+)
+mv build/sist2 sist2-x64-linux

+(
+  cd build
   rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+  cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
-mv sist2_debug sist2-x64-linux-debug
+)
+mv build/sist2_debug sist2-x64-linux-debug
@@ -4,14 +4,19 @@ VCPKG_ROOT="/vcpkg"

 git submodule update --init --recursive

-rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+mkdir build
+(
+  cd build
+  cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
-mv sist2 sist2-arm64-linux
+)
+mv build/sist2 sist2-arm64-linux

 rm -rf CMakeFiles CMakeCache.txt
-cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" .
+(
+  cd build
+  cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG_INFO=on -DSIST_DEBUG=on -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE="${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" ..
   make -j $(nproc)
   strip sist2
-mv sist2_debug sist2-arm64-linux-debug
+)
+mv build/sist2_debug sist2-arm64-linux-debug
@@ -1,3 +1,3 @@
 docker run --rm -it --name "sist2-dev-es"\
     -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" \
-    -e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.1.2
+    -e "ES_JAVA_OPTS=-Xms8g -Xmx8g" elasticsearch:8.7.0
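A quick way to confirm the dev container above is accepting connections (a standard Elasticsearch endpoint, nothing sist2-specific):

```bash
curl -s "http://localhost:9200/_cluster/health?pretty"
```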
sist2-admin/frontend/package-lock.json (generated, 12 changes)
@@ -10491,9 +10491,9 @@
       "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
     },
     "node_modules/webpack": {
-      "version": "5.75.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz",
-      "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==",
+      "version": "5.78.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.78.0.tgz",
+      "integrity": "sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==",
       "dev": true,
       "dependencies": {
         "@types/eslint-scope": "^3.7.3",

@@ -18719,9 +18719,9 @@
       "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
     },
     "webpack": {
-      "version": "5.75.0",
-      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz",
-      "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==",
+      "version": "5.78.0",
+      "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.78.0.tgz",
+      "integrity": "sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==",
      "dev": true,
       "requires": {
         "@types/eslint-scope": "^3.7.3",

sist2-admin/frontend/yarn.lock

@@ -1390,14 +1390,14 @@
     thread-loader "^3.0.0"
     webpack "^5.54.0"

-"@vue/cli-plugin-router@~5.0.8":
+"@vue/cli-plugin-router@^5.0.8", "@vue/cli-plugin-router@~5.0.8":
   version "5.0.8"
   resolved "https://registry.npmjs.org/@vue/cli-plugin-router/-/cli-plugin-router-5.0.8.tgz"
   integrity sha512-Gmv4dsGdAsWPqVijz3Ux2OS2HkMrWi1ENj2cYL75nUeL+Xj5HEstSqdtfZ0b1q9NCce+BFB6QnHfTBXc/fCvMg==
   dependencies:
     "@vue/cli-shared-utils" "^5.0.8"

-"@vue/cli-plugin-vuex@~5.0.8":
+"@vue/cli-plugin-vuex@^5.0.8", "@vue/cli-plugin-vuex@~5.0.8":
   version "5.0.8"
   resolved "https://registry.npmjs.org/@vue/cli-plugin-vuex/-/cli-plugin-vuex-5.0.8.tgz"
   integrity sha512-HSYWPqrunRE5ZZs8kVwiY6oWcn95qf/OQabwLfprhdpFWAGtLStShjsGED2aDpSSeGAskQETrtR/5h7VqgIlBA==

@@ -5492,9 +5492,9 @@ webpack-virtual-modules@^0.4.2:
   integrity sha512-5tyDlKLqPfMqjT3Q9TAqf2YqjwmnUleZwzJi1A5qXnlBCdj2AtOJ6wAWdglTIDOPgOiOrXeBeFcsQ8+aGQ6QbA==

 webpack@^5.54.0:
-  version "5.75.0"
-  resolved "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz"
-  integrity sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==
+  version "5.78.0"
+  resolved "https://registry.yarnpkg.com/webpack/-/webpack-5.78.0.tgz#836452a12416af2a7beae906b31644cb2562f9e6"
+  integrity sha512-gT5DP72KInmE/3azEaQrISjTvLYlSM0j1Ezhht/KLVkrqtv10JoP/RXhwmX/frrutOPuSq3o5Vq0ehR/4Vmd1g==
   dependencies:
     "@types/eslint-scope" "^3.7.3"
     "@types/estree" "^0.0.51"
sist2-vue/package-lock.json (generated, 1615 changes)
File diff suppressed because it is too large.
sist2-vue/package.json

@@ -9,10 +9,11 @@
   "dependencies": {
     "@auth0/auth0-spa-js": "^2.0.2",
     "@egjs/vue-infinitegrid": "3.3.0",
+    "@tensorflow/tfjs": "^4.4.0",
     "axios": "^0.25.0",
     "bootstrap-vue": "^2.21.2",
     "core-js": "^3.6.5",
-    "d3": "^5.6.1",
+    "d3": "^7.8.4",
     "date-fns": "^2.21.3",
     "dom-to-image": "^2.6.0",
     "fslightbox-vue": "fslightbox-vue.tgz",
@@ -19,6 +19,7 @@
 import NavBar from "@/components/NavBar";
 import {mapActions, mapGetters, mapMutations} from "vuex";
 import Sist2Api from "@/Sist2Api";
+import ModelsRepo from "@/ml/modelsRepo";
 import {setupAuth0} from "@/main";

 export default {

@@ -36,6 +37,17 @@ export default {
   mounted() {
     this.$store.dispatch("loadConfiguration").then(() => {
       this.$root.$i18n.locale = this.$store.state.optLang;
+      ModelsRepo.init(this.$store.getters.mlRepositoryList).catch(err => {
+        this.$bvToast.toast(
+          this.$t("ml.repoFetchError"),
+          {
+            title: this.$t("ml.repoFetchErrorTitle"),
+            noAutoHide: true,
+            toaster: "b-toaster-bottom-right",
+            headerClass: "toast-header-warning",
+            bodyClass: "toast-body-warning",
+          });
+      });
     });

     this.$store.subscribe((mutation) => {
@@ -361,20 +361,20 @@ class Sist2Api {
         });
     }

-    getTreemapCsvUrl(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/1`;
+    getTreemapStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/TMAP`;
     }

-    getMimeCsvUrl(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/2`;
+    getMimeStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/MAGG`;
     }

-    getSizeCsv(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/3`;
+    getSizeStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/SAGG`;
     }

-    getDateCsv(indexId: string) {
-        return `${this.baseUrl}s/${indexId}/4`;
+    getDateStat(indexId: string) {
+        return `${this.baseUrl}s/${indexId}/DAGG`;
     }
 }
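The renamed methods suggest the stats endpoints now return JSON under mnemonic names (`TMAP`, `MAGG`, `SAGG`, `DAGG`) instead of numbered CSV routes. A hedged check against a running `sist2 web` instance; the base URL and index id are placeholders:

```bash
# Sketch: fetch the treemap stats for an index; <index-id> is hypothetical.
curl -s "http://localhost:4090/s/<index-id>/TMAP"
```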
sist2-vue/src/components/AnalyzedContentSpan.vue (new file, 21 lines)
@@ -0,0 +1,21 @@
<template>
    <span :style="getStyle()">{{span.text}}</span>
</template>

<script>

import ModelsRepo from "@/ml/modelsRepo";

export default {
    name: "AnalyzedContentSpan",
    props: ["span", "text"],
    methods: {
        getStyle() {
            return ModelsRepo.data[this.$store.getters.mlModel.name].labelStyles[this.span.label];
        }
    }
}
</script>

<style scoped></style>
sist2-vue/src/components/AnalyzedContentSpanContainer.vue (new file, 75 lines)
@@ -0,0 +1,75 @@
<template>
  <div>
    <b-card class="mb-2">
      <AnalyzedContentSpan v-for="span of legend" :key="span.id" :span="span"
                           class="mr-2"></AnalyzedContentSpan>
    </b-card>
    <div class="content-div">
      <AnalyzedContentSpan v-for="span of mergedSpans" :key="span.id" :span="span"></AnalyzedContentSpan>
    </div>
  </div>
</template>

<script>

import AnalyzedContentSpan from "@/components/AnalyzedContentSpan.vue";
import ModelsRepo from "@/ml/modelsRepo";

export default {
  name: "AnalyzedContentSpanContainer",
  components: {AnalyzedContentSpan},
  props: ["spans", "text"],
  computed: {
    legend() {
      return Object.entries(ModelsRepo.data[this.$store.state.mlModel.name].legend)
        .map(([label, name]) => ({
          text: name,
          id: label,
          label: label
        }));
    },
    mergedSpans() {
      const spans = this.spans;

      const merged = [];

      let lastLabel = null;
      let fixSpace = false;
      for (let i = 0; i < spans.length; i++) {

        if (spans[i].label !== lastLabel) {
          let start = spans[i].wordIndex;
          const nextSpan = spans.slice(i + 1).find(s => s.label !== spans[i].label)
          let end = nextSpan ? nextSpan.wordIndex : undefined;

          if (end !== undefined && this.text[end - 1] === " ") {
            end -= 1;
            fixSpace = true;
          }

          merged.push({
            text: this.text.slice(start, end),
            label: spans[i].label,
            id: spans[i].wordIndex
          });

          if (fixSpace) {
            merged.push({
              text: " ",
              label: "O",
              id: end
            });
            fixSpace = false;
          }
          lastLabel = spans[i].label;
        }
      }

      return merged;
    },
  },
}
</script>

<style scoped></style>
@@ -120,7 +120,7 @@ export default {
     update(indexId) {
       const svg = d3.select("#date-histogram");

-      d3.csv(Sist2Api.getDateCsv(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getDateStat(indexId)).then(tabularData => {
         dateHistogram(tabularData.slice(), svg, this.$t("d3.dateHistogram"));
       });
     }

@@ -91,7 +91,7 @@ export default {
       const mimeSvgCount = d3.select("#agg-mime-count");
       const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;

-      d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
         mimeBarCount(tabularData.slice(), mimeSvgCount, fillOpacity, this.$t("d3.mimeCount"));
       });
     }

@@ -90,7 +90,7 @@ export default {
       const mimeSvgSize = d3.select("#agg-mime-size");
       const fillOpacity = this.$store.state.optTheme === "black" ? 0.9 : 0.6;

-      d3.csv(Sist2Api.getMimeCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getMimeStat(indexId)).then(tabularData => {
         mimeBarSize(tabularData.slice(), mimeSvgSize, fillOpacity, this.$t("d3.mimeSize"));
       });
     }

@@ -117,7 +117,7 @@ export default {
     update(indexId) {
       const svg = d3.select("#size-histogram");

-      d3.csv(Sist2Api.getSizeCsv(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getSizeStat(indexId)).then(tabularData => {
         sizeHistogram(tabularData.slice(), svg, this.$t("d3.sizeHistogram"));
       });
     }

@@ -240,7 +240,7 @@ export default {
         .style("overflow", "visible")
         .style("font", "10px sans-serif");

-      d3.csv(Sist2Api.getTreemapCsvUrl(indexId)).then(tabularData => {
+      d3.json(Sist2Api.getTreemapStat(indexId)).then(tabularData => {
         tabularData.forEach(row => {
           row.taxonomy = row.path.split("/");
           row.size = Number(row.size);
@@ -1,5 +1,5 @@
 <template>
-  <b-card class="mb-4 mt-4">
+  <b-card v-if="$store.state.sist2Info.showDebugInfo" class="mb-4 mt-4">
     <b-card-title><DebugIcon class="mr-1"></DebugIcon>{{ $t("debug") }}</b-card-title>
     <p v-html="$t('debugDescription')"></p>
@@ -16,6 +16,10 @@ export default {
   props: ["doc"],
   computed: {
     featuredLineHtml() {
+      if (this.$store.getters.optFeaturedFields === undefined) {
+        return "";
+      }
+
       const scope = {doc: this.doc._source, humanDate: humanDate, humanFileSize: humanFileSize};

       return this.$store.getters.optFeaturedFields
@@ -1,6 +1,36 @@
 <template>
   <Preloader v-if="loading"></Preloader>
-  <div v-else-if="content" class="content-div" v-html="content"></div>
+  <div v-else-if="content">
+    <b-form inline class="my-2" v-if="ModelsRepo.getOptions().length > 0">
+      <b-checkbox class="ml-auto mr-2" :checked="optAutoAnalyze"
+                  @input="setOptAutoAnalyze($event); $store.dispatch('updateConfiguration')">
+        {{ $t("ml.auto") }}
+      </b-checkbox>
+      <b-button :disabled="mlPredictionsLoading || mlLoading" @click="mlAnalyze" variant="primary"
+      >{{ $t("ml.analyzeText") }}
+      </b-button>
+      <b-select :disabled="mlPredictionsLoading || mlLoading" class="ml-2" v-model="mlModel">
+        <b-select-option :value="opt.value" v-for="opt of ModelsRepo.getOptions()">{{ opt.text }}
+        </b-select-option>
+      </b-select>
+    </b-form>
+
+    <b-progress v-if="mlLoading" variant="warning" show-progress :max="1" class="mb-3"
+    >
+      <b-progress-bar :value="modelLoadingProgress">
+        <strong>{{ ((modelLoadingProgress * modelSize) / (1024 * 1024)).toFixed(1) }}MB / {{
+            (modelSize / (1024 * 1024)).toFixed(1)
+          }}MB</strong>
+      </b-progress-bar>
+    </b-progress>
+
+    <b-progress v-if="mlPredictionsLoading" variant="primary" :value="modelPredictionProgress"
+                :max="content.length" class="mb-3"></b-progress>
+
+    <AnalyzedContentSpansContainer v-if="analyzedContentSpans.length > 0"
+                                   :spans="analyzedContentSpans" :text="rawContent"></AnalyzedContentSpansContainer>
+    <div v-else class="content-div" v-html="content"></div>
+  </div>
 </template>

 <script>

@@ -8,22 +38,40 @@ import Sist2Api from "@/Sist2Api";
 import Preloader from "@/components/Preloader";
 import Sist2Query from "@/Sist2Query";
 import store from "@/store";
+import BertNerModel from "@/ml/BertNerModel";
+import AnalyzedContentSpansContainer from "@/components/AnalyzedContentSpanContainer.vue";
+import ModelsRepo from "@/ml/modelsRepo";
+import {mapGetters, mapMutations} from "vuex";

 export default {
   name: "LazyContentDiv",
-  components: {Preloader},
+  components: {AnalyzedContentSpansContainer, Preloader},
   props: ["docId"],
   data() {
     return {
+      ModelsRepo,
       content: "",
-      loading: true
+      rawContent: "",
+      loading: true,
+      modelLoadingProgress: 0,
+      modelPredictionProgress: 0,
+      mlPredictionsLoading: false,
+      mlLoading: false,
+      mlModel: null,
+      analyzedContentSpans: []
     }
   },
   mounted() {
+    if (this.$store.getters.optMlDefaultModel) {
+      this.mlModel = this.$store.getters.optMlDefaultModel
+    } else {
+      this.mlModel = ModelsRepo.getDefaultModel();
+    }
+
     const query = Sist2Query.searchQuery();

     if (this.$store.state.optHighlight) {

       const fields = this.$store.state.fuzzy
         ? {"content.nGram": {}}
         : {content: {}};

@@ -67,14 +115,28 @@ export default {
       this.loading = false;
       if (resp.hits.hits.length === 1) {
         this.content = this.getContent(resp.hits.hits[0]);
       } else {
         console.log("FIXME: could not get content")
         console.log(resp)
       }
+
+      if (this.optAutoAnalyze) {
+        this.mlAnalyze();
+      }
     });
   },
+  computed: {
+    ...mapGetters(["optAutoAnalyze"]),
+    modelSize() {
+      const modelData = ModelsRepo.data[this.mlModel];
+      if (!modelData) {
+        return 0;
+      }
+      return modelData.size;
+    }
+  },
   methods: {
+    ...mapMutations(["setOptAutoAnalyze"]),
     getContent(doc) {
+      this.rawContent = doc._source.content;
+
       if (!doc.highlight) {
         return doc._source.content;
       }

@@ -85,10 +147,60 @@ export default {
       if (doc.highlight.content) {
         return doc.highlight.content[0];
       }
     },
+    async getMlModel() {
+      if (this.$store.getters.mlModel.name !== this.mlModel) {
+        this.mlLoading = true;
+        this.modelLoadingProgress = 0;
+        const modelInfo = ModelsRepo.data[this.mlModel];
+
+        const model = new BertNerModel(
+          modelInfo.vocabUrl,
+          modelInfo.modelUrl,
+          modelInfo.id2label,
+        )
+
+        await model.init(progress => this.modelLoadingProgress = progress);
+        this.$store.commit("setMlModel", {model, name: this.mlModel});
+
+        this.mlLoading = false;
+        return model
+      }
+
+      return this.$store.getters.mlModel.model;
+    },
+    async mlAnalyze() {
+      if (!this.content) {
+        return;
+      }
+
+      const modelInfo = ModelsRepo.data[this.mlModel];
+      if (modelInfo === undefined) {
+        return;
+      }
+
+      this.$store.commit("setOptMlDefaultModel", this.mlModel);
+      await this.$store.dispatch("updateConfiguration");
+
+      const model = await this.getMlModel();
+
+      this.analyzedContentSpans = [];
+
+      this.mlPredictionsLoading = true;
+
+      await model.predict(this.rawContent, results => {
+        results.forEach(result => result.label = modelInfo.humanLabels[result.label]);
+        this.analyzedContentSpans.push(...results);
+        this.modelPredictionProgress = results[results.length - 1].wordIndex;
+      });
+      this.mlPredictionsLoading = false;
+    }
   }
 }
 </script>

-<style scoped>
+<style>
 .progress-bar {
   transition: none;
 }
 </style>
@@ -49,6 +49,7 @@ export default {
         configReset: "Reset configuration",
         searchOptions: "Search options",
         treemapOptions: "Treemap options",
+        mlOptions: "Machine learning options",
         displayOptions: "Display options",
         opt: {
             lang: "Language",

@@ -78,7 +79,10 @@ export default {
             simpleLightbox: "Disable animations in image viewer",
             showTagPickerFilter: "Display the tag filter bar",
             featuredFields: "Featured fields Javascript template string. Will appear in the search results.",
-            featuredFieldsList: "Available variables"
+            featuredFieldsList: "Available variables",
+            autoAnalyze: "Automatically analyze text",
+            defaultModel: "Default model",
+            mlRepositories: "Model repositories (one per line)"
         },
         queryMode: {
             simple: "Simple",

@@ -171,6 +175,12 @@ export default {
             selectedIndex: "selected index",
            selectedIndices: "selected indices",
         },
+        ml: {
+            analyzeText: "Analyze",
+            auto: "Auto",
+            repoFetchError: "Failed to get list of models. Check browser console for more details.",
+            repoFetchErrorTitle: "Could not fetch model repositories",
+        }
     },
     de: {
         filePage: {

@@ -250,8 +260,8 @@ export default {
             vidPreviewInterval: "Videovorschau Framedauer in ms",
             simpleLightbox: "Schalte Animationen im Image-Viewer ab",
             showTagPickerFilter: "Zeige die Tag-Filter-Leiste",
-            featuredFields: "Ausgewählte Felder Javascript Vorlage String. Wird in den Suchergebnissen angezeigt.",
-            featuredFieldsList: "Verfügbare Variablen"
+            featuredFields: "Variablen, welche zusätzlich in den Suchergebnissen angezeigt werden können.",
+            featuredFieldsList: "verfügbare Variablen"
         },
         queryMode: {
             simple: "Einfach",

@@ -333,10 +343,10 @@ export default {
             random: "zufällig",
         },
         d3: {
-            mimeCount: "Anzahlverteilung nach Medientyp",
-            mimeSize: "Größenverteilung nach Medientyp",
-            dateHistogram: "Verteilung der Änderungszeiten",
-            sizeHistogram: "Verteilung der Dateigrößen",
+            mimeCount: "Anzahl nach Medientyp",
+            mimeSize: "Größen nach Medientyp",
+            dateHistogram: "Änderungszeiten",
+            sizeHistogram: "Dateigrößen",
         },
         indexPicker: {
             selectNone: "keinen auswählen",
sist2-vue/src/ml/BertNerModel.js (new file, 77 lines)
@@ -0,0 +1,77 @@
import BertTokenizer from "@/ml/BertTokenizer";
import * as tf from "@tensorflow/tfjs";
import axios from "axios";

export default class BertNerModel {
    vocabUrl;
    modelUrl;

    id2label;
    _tokenizer;
    _model;
    inputSize = 128;

    _previousWordId = null;

    constructor(vocabUrl, modelUrl, id2label) {
        this.vocabUrl = vocabUrl;
        this.modelUrl = modelUrl;
        this.id2label = id2label;
    }

    async init(onProgress) {
        await Promise.all([this.loadTokenizer(), this.loadModel(onProgress)]);
    }

    async loadTokenizer() {
        const vocab = (await axios.get(this.vocabUrl)).data;
        this._tokenizer = new BertTokenizer(vocab);
    }

    async loadModel(onProgress) {
        this._model = await tf.loadGraphModel(this.modelUrl, {onProgress});
    }

    alignLabels(labels, wordIds, words) {
        const result = [];

        for (let i = 0; i < this.inputSize; i++) {
            const label = labels[i];
            const wordId = wordIds[i];

            if (wordId === -1) {
                continue;
            }
            if (wordId === this._previousWordId) {
                continue;
            }

            result.push({
                word: words[wordId].text, wordIndex: words[wordId].index, label: label
            });
            this._previousWordId = wordId;
        }

        return result;
    }

    async predict(text, callback) {
        this._previousWordId = null;
        const encoded = this._tokenizer.encodeText(text, this.inputSize)

        for (let chunk of encoded.inputChunks) {
            const rawResult = tf.tidy(() => this._model.execute({
                input_ids: tf.tensor2d(chunk.inputIds, [1, this.inputSize], "int32"),
                token_type_ids: tf.tensor2d(chunk.segmentIds, [1, this.inputSize], "int32"),
                attention_mask: tf.tensor2d(chunk.inputMask, [1, this.inputSize], "int32"),
            }));

            const labelIds = await tf.argMax(rawResult, -1);
            const labelIdsArray = await labelIds.array();
            const labels = labelIdsArray[0].map(id => this.id2label[id]);
            rawResult.dispose()

            callback(this.alignLabels(labels, chunk.wordIds, encoded.words))
        }
    }
}
sist2-vue/src/ml/BertTokenizer.js (new file, 184 lines; diff truncated below)
@@ -0,0 +1,184 @@
|
||||
import {zip, chunk} from "underscore";

const UNK_INDEX = 100;
const CLS_INDEX = 101;
const SEP_INDEX = 102;
const CONTINUING_SUBWORD_PREFIX = "##";

function isWhitespace(ch) {
    return /\s/.test(ch);
}

function isInvalid(ch) {
    return (ch.charCodeAt(0) === 0 || ch.charCodeAt(0) === 0xfffd);
}

const punctuations = '[~`!@#$%^&*(){}[];:"\'<,.>?/\\|-_+=';

/** Returns true if ch is a punctuation character. */
function isPunctuation(ch) {
    return punctuations.indexOf(ch) !== -1;
}

export default class BertTokenizer {
    vocab;

    constructor(vocab) {
        this.vocab = vocab;
    }

    tokenize(text) {
        const charOriginalIndex = [];
        const cleanedText = this.cleanText(text, charOriginalIndex);
        const origTokens = cleanedText.split(' ');

        let charCount = 0;
        const tokens = origTokens.map((token) => {
            token = token.toLowerCase();
            const tokens = this.runSplitOnPunctuation(token, charCount, charOriginalIndex);
            charCount += token.length + 1;
            return tokens;
        });

        let flattenTokens = [];
        for (let index = 0; index < tokens.length; index++) {
            flattenTokens = flattenTokens.concat(tokens[index]);
        }
        return flattenTokens;
    }

    /* Performs invalid character removal and whitespace cleanup on text. */
    cleanText(text, charOriginalIndex) {
        text = text.replace(/\?/g, "").trim();

        const stringBuilder = [];
        let originalCharIndex = 0;
        let newCharIndex = 0;

        for (const ch of text) {
            // Skip the characters that cannot be used.
            if (isInvalid(ch)) {
                originalCharIndex += ch.length;
                continue;
            }
            if (isWhitespace(ch)) {
                if (stringBuilder.length > 0 && stringBuilder[stringBuilder.length - 1] !== ' ') {
                    stringBuilder.push(' ');
                    charOriginalIndex[newCharIndex] = originalCharIndex;
                    originalCharIndex += ch.length;
                } else {
                    originalCharIndex += ch.length;
                    continue;
                }
            } else {
                stringBuilder.push(ch);
                charOriginalIndex[newCharIndex] = originalCharIndex;
                originalCharIndex += ch.length;
            }
            newCharIndex++;
        }
        return stringBuilder.join('');
    }

    /* Splits punctuation on a piece of text. */
    runSplitOnPunctuation(text, count, charOriginalIndex) {
        const tokens = [];
        let startNewWord = true;
        for (const ch of text) {
            if (isPunctuation(ch)) {
                tokens.push({text: ch, index: charOriginalIndex[count]});
                count += ch.length;
                startNewWord = true;
            } else {
                if (startNewWord) {
                    tokens.push({text: '', index: charOriginalIndex[count]});
                    startNewWord = false;
                }
                tokens[tokens.length - 1].text += ch;
                count += ch.length;
            }
        }
        return tokens;
    }

    encode(words) {
        let outputTokens = [];
        const wordIds = [];

        for (let i = 0; i < words.length; i++) {
            let chars = [...words[i].text];

            let isUnknown = false;
            let start = 0;
            let subTokens = [];

            // Greedy longest-match-first WordPiece lookup.
            while (start < chars.length) {
                let end = chars.length;
                let currentSubstring = null;
                while (start < end) {
                    let substr = chars.slice(start, end).join('');

                    if (start > 0) {
                        substr = CONTINUING_SUBWORD_PREFIX + substr;
                    }
                    if (this.vocab.includes(substr)) {
                        // currentSubstring holds the vocabulary *index* of the match.
                        currentSubstring = this.vocab.indexOf(substr);
                        break;
                    }

                    --end;
                }
                if (currentSubstring == null) {
                    isUnknown = true;
                    break;
                }
                subTokens.push(currentSubstring);
                start = end;
            }

            if (isUnknown) {
                outputTokens.push(UNK_INDEX);
                wordIds.push(i);
            } else {
                subTokens.forEach(tok => {
                    outputTokens.push(tok);
                    wordIds.push(i);
                });
            }
        }

        return {tokens: outputTokens, wordIds};
    }

    encodeText(inputText, inputSize) {

        const tokenized = this.tokenize(inputText);
        const encoded = this.encode(tokenized);

        const encodedTokenChunks = chunk(encoded.tokens, inputSize - 2);
        const encodedWordIdChunks = chunk(encoded.wordIds, inputSize - 2);

        const chunks = [];

        zip(encodedTokenChunks, encodedWordIdChunks).forEach(([tokens, wordIds]) => {
            const inputIds = [CLS_INDEX, ...tokens, SEP_INDEX];
            const segmentIds = Array(inputIds.length).fill(0);
            const inputMask = Array(inputIds.length).fill(1);
            wordIds = [-1, ...wordIds, -1];

            while (inputIds.length < inputSize) {
                inputIds.push(0);
                inputMask.push(0);
                segmentIds.push(0);
                wordIds.push(-1);
            }

            chunks.push({inputIds, inputMask, segmentIds, wordIds});
        });

        return {
            inputChunks: chunks,
            words: tokenized
        };
    }
}
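A usage sketch for the tokenizer (illustrative only, not part of this diff; the
four-entry vocabulary is a toy -- real vocab files have tens of thousands of
entries whose indices line up with the UNK/CLS/SEP constants above):

const tokenizer = new BertTokenizer(["[PAD]", "hello", "world", "##s"]);
const encoded = tokenizer.encodeText("Hello worlds!", 128);
// encoded.words -> [{text: "hello", index: 0}, {text: "worlds", index: 6}, {text: "!", index: 12}]
// encoded.inputChunks[0].inputIds -> [CLS_INDEX, 1, 2, 3, UNK_INDEX, SEP_INDEX, 0, 0, ...] (padded to 128)
// encoded.inputChunks[0].wordIds  -> [-1, 0, 1, 1, 2, -1, -1, ...]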
43 sist2-vue/src/ml/modelsRepo.js Normal file
@@ -0,0 +1,43 @@
import axios from "axios";

class ModelsRepo {
    _repositories;
    data = {};

    async init(repositories) {
        this._repositories = repositories;

        const data = await Promise.all(this._repositories.map(this._loadRepository));

        data.forEach(models => {
            models.forEach(model => {
                this.data[model.name] = model;
            });
        });
    }

    async _loadRepository(repository) {
        const data = (await axios.get(repository)).data;
        data.forEach(model => {
            // Resolve model/vocab paths relative to the repository URL
            model["modelUrl"] = new URL(model["modelPath"], repository).href;
            model["vocabUrl"] = new URL(model["vocabPath"], repository).href;
        });
        return data;
    }

    getOptions() {
        return Object.values(this.data).map(model => ({
            text: `${model.name} (${Math.round(model.size / (1024 * 1024))}MB)`,
            value: model.name
        }));
    }

    getDefaultModel() {
        if (Object.values(this.data).length === 0) {
            return null;
        }
        // Guard against repositories where no model is flagged as default
        const defaultModel = Object.values(this.data).find(model => model.default);
        return defaultModel ? defaultModel.name : null;
    }
}

export default new ModelsRepo();
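Usage sketch (illustrative only, not part of this diff; the repository URL is
the default value of optMlRepositories in the store change below):

import modelsRepo from "@/ml/modelsRepo";

await modelsRepo.init(["https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json"]);
modelsRepo.getOptions();      // e.g. [{text: "<name> (25MB)", value: "<name>"}, ...]
modelsRepo.getDefaultModel(); // name of the entry flagged "default", or null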
@@ -5,7 +5,7 @@ import {EsHit, EsResult, EsTag, Index, Tag} from "@/Sist2Api";
 import {deserializeMimes, randomSeed, serializeMimes} from "@/util";
 import {getInstance} from "@/plugins/auth0.js";

-const CONF_VERSION = 2;
+const CONF_VERSION = 3;

 Vue.use(Vuex)
@@ -57,6 +57,9 @@ export default new Vuex.Store({
         optVidPreviewInterval: 700,
         optSimpleLightbox: true,
         optShowTagPickerFilter: true,
+        optMlRepositories: "https://raw.githubusercontent.com/simon987/sist2-ner-models/main/repo.json",
+        optAutoAnalyze: false,
+        optMlDefaultModel: null,

         _onLoadSelectedIndices: [] as string[],
         _onLoadSelectedMimeTypes: [] as string[],
@@ -86,7 +89,11 @@ export default new Vuex.Store({

         uiMimeMap: [] as any[],

-        auth0Token: null
+        auth0Token: null,
+        mlModel: {
+            model: null,
+            name: null
+        },
     },
     mutations: {
         setUiShowDetails: (state, val) => state.uiShowDetails = val,
@@ -172,6 +179,9 @@ export default new Vuex.Store({
         setOptVidPreviewInterval: (state, val) => state.optVidPreviewInterval = val,
         setOptSimpleLightbox: (state, val) => state.optSimpleLightbox = val,
         setOptShowTagPickerFilter: (state, val) => state.optShowTagPickerFilter = val,
+        setOptAutoAnalyze: (state, val) => {state.optAutoAnalyze = val},
+        setOptMlRepositories: (state, val) => {state.optMlRepositories = val},
+        setOptMlDefaultModel: (state, val) => {state.optMlDefaultModel = val},

         setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
         setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
@@ -194,6 +204,7 @@ export default new Vuex.Store({
             // noop
         },
         setAuth0Token: (state, val) => state.auth0Token = val,
+        setMlModel: (state, val) => state.mlModel = val,
     },
     actions: {
         setSist2Info: (store, val) => {
@@ -350,6 +361,7 @@ export default new Vuex.Store({
     },
     modules: {},
     getters: {
+        mlModel: (state) => state.mlModel,
         seed: (state) => state.seed,
         getPathText: (state) => state.pathText,
         indices: state => state.indices,
@@ -416,5 +428,12 @@ export default new Vuex.Store({
         optSimpleLightbox: state => state.optSimpleLightbox,
         optShowTagPickerFilter: state => state.optShowTagPickerFilter,
         optFeaturedFields: state => state.optFeaturedFields,
+        optMlRepositories: state => state.optMlRepositories,
+        mlRepositoryList: state => {
+            const repos = state.optMlRepositories.split("\n")
+            return repos[0] == "" ? [] : repos;
+        },
+        optMlDefaultModel: state => state.optMlDefaultModel,
+        optAutoAnalyze: state => state.optAutoAnalyze,
     }
 })
@@ -25,7 +25,8 @@
             <b-form-select :options="themeOptions" :value="optTheme" @input="setOptTheme"></b-form-select>

             <label>{{ $t("opt.displayMode") }}</label>
-            <b-form-select :options="displayModeOptions" :value="optDisplay" @input="setOptDisplay"></b-form-select>
+            <b-form-select :options="displayModeOptions" :value="optDisplay"
+                           @input="setOptDisplay"></b-form-select>

             <label>{{ $t("opt.columns") }}</label>
             <b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
@@ -123,7 +124,10 @@
                 }}
             </b-form-checkbox>

-            <b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
+            <b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{
+                    $t("opt.highlight")
+                }}
+            </b-form-checkbox>
             <b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
                     $t("opt.tagOrOperator")
                 }}
@@ -148,7 +152,8 @@
                           @input="setOptResultSize"></b-form-input>

             <label>{{ $t("opt.queryMode") }}</label>
-            <b-form-select :options="queryModeOptions" :value="optQueryMode" @input="setOptQueryMode"></b-form-select>
+            <b-form-select :options="queryModeOptions" :value="optQueryMode"
+                           @input="setOptQueryMode"></b-form-select>

             <label>{{ $t("opt.slideDuration") }}</label>
             <b-form-input :value="optLightboxSlideDuration" type="number" min="1"
@@ -159,6 +164,17 @@
                           @input="setOptVidPreviewInterval"></b-form-input>
         </b-card>

+        <h4 class="mt-3">{{ $t("mlOptions") }}</h4>
+        <b-card>
+            <label>{{ $t("opt.mlRepositories") }}</label>
+            <b-textarea rows="3" :value="optMlRepositories" @input="setOptMlRepositories"></b-textarea>
+            <br>
+            <b-form-checkbox :checked="optAutoAnalyze" @input="setOptAutoAnalyze">{{
+                    $t("opt.autoAnalyze")
+                }}
+            </b-form-checkbox>
+        </b-card>
+
         <h4 class="mt-3">{{ $t("treemapOptions") }}</h4>
         <b-card>
             <label>{{ $t("opt.treemapType") }}</label>
@@ -311,6 +327,8 @@ export default {
             "optSimpleLightbox",
             "optShowTagPickerFilter",
             "optFeaturedFields",
+            "optMlRepositories",
+            "optAutoAnalyze",
         ]),
         clientWidth() {
             return window.innerWidth;
@@ -355,6 +373,8 @@ export default {
             "setOptSimpleLightbox",
             "setOptShowTagPickerFilter",
             "setOptFeaturedFields",
+            "setOptMlRepositories",
+            "setOptAutoAnalyze",
         ]),
         onResetClick() {
             localStorage.removeItem("sist2_configuration");
@@ -7,7 +7,11 @@
             <Preloader></Preloader>
         </b-card>

-        <b-card v-show="!uiLoading" id="search-panel">
+        <b-alert v-show="!uiLoading && showEsConnectionError" show variant="danger" class="mt-2">
+            {{ $t("toast.esConnErr") }}
+        </b-alert>
+
+        <b-card v-show="!uiLoading && !showEsConnectionError" id="search-panel">
             <SearchBar @show-help="showHelp=true"></SearchBar>
             <b-row>
                 <b-col style="height: 70px;" sm="6">
@@ -94,7 +98,8 @@ export default Vue.extend({
         docChecksums: new Set(),
         searchBusy: false,
         Sist2Query: Sist2Query,
-        showHelp: false
+        showHelp: false,
+        showEsConnectionError: false
     }),
     computed: {
         ...mapGetters(["indices", "optDisplay"]),
@@ -143,6 +148,15 @@ export default Vue.extend({
                 this.uiLoading = false;
                 this.search(true);
             });
+        }).catch(error => {
+            console.log(error);
+
+            if (error.response.status == 503 || error.response.status == 500) {
+                this.showEsConnectionError = true;
+                this.uiLoading = false;
+            } else {
+                this.showErrorToast();
+            }
         });
     },
     methods: {
@@ -253,11 +267,20 @@ export default Vue.extend({
                 },
                 size: 0
             }).then(res => {
-                return {
+                const range = {
                     min: res.aggregations.dateMin.value,
                     max: res.aggregations.dateMax.value,
                 }
-            })
+
+                if (range.min == null) {
+                    range.min = 0;
+                    range.max = 1;
+                } else if (range.min == range.max) {
+                    range.max += 1;
+                }
+
+                return range;
+            });
         },
         appendFunc() {
             if (!this.$store.state.uiReachedScrollEnd && this.search && !this.searchBusy) {
@@ -83,6 +83,7 @@ void database_open(database_t *db) {
     LOG_DEBUGF("database.c", "Opening database %s (%d)", db->filename, db->type);

     CRASH_IF_NOT_SQLITE_OK(sqlite3_open(db->filename, &db->db));
+    sqlite3_busy_timeout(db->db, 1000);

     CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA cache_size = -200000;", NULL, NULL, NULL));
     CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA synchronous = OFF;", NULL, NULL, NULL));
@@ -328,18 +329,18 @@ database_iterator_t *database_create_document_iterator(database_t *db) {
             " WHEN sc.json_data IS NULL THEN"
             " CASE"
             " WHEN t.tag IS NULL THEN"
-            " document.json_data"
+            " json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime)"
             " ELSE"
-            " json_set(document.json_data, '$.tag', json_group_array(t.tag))"
+            " json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
             " END"
             " ELSE"
             " CASE"
             " WHEN t.tag IS NULL THEN"
-            " json_patch(document.json_data, sc.json_data)"
+            " json_patch(json_set(document.json_data, '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime), sc.json_data)"
             " ELSE"
             // This will overwrite any tags specified in the sidecar file!
             // TODO: concatenate the two arrays?
-            " json_set(json_patch(document.json_data, sc.json_data), '$.tag', json_group_array(t.tag))"
+            " json_set(json_patch(document.json_data, sc.json_data), '$._id', document.id, '$.size', document.size, '$.mtime', document.mtime, '$.tag', json_group_array(t.tag))"
             " END"
             " END"
             " FROM document"
@@ -581,18 +582,33 @@ void database_add_work(database_t *db, job_t *job) {
             ret = sqlite3_step(db->insert_parse_job_stmt);

             if (ret == SQLITE_FULL) {
                 sqlite3_reset(db->insert_parse_job_stmt);
                 pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                 usleep(1000000);
                 pthread_mutex_lock(&db->ipc_ctx->db_mutex);
                 continue;
             } else {
                 CRASH_IF_STMT_FAIL(ret);
             }

-            CRASH_IF_NOT_SQLITE_OK(sqlite3_reset(db->insert_parse_job_stmt));
-        } while (ret != SQLITE_DONE);
+            ret = sqlite3_reset(db->insert_parse_job_stmt);
+            if (ret == SQLITE_FULL) {
+                pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
+                usleep(100000);
+                pthread_mutex_lock(&db->ipc_ctx->db_mutex);
+            } else if (ret != SQLITE_OK) {
+                LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
+            }
+        } while (ret != SQLITE_DONE && ret != SQLITE_OK);
     } else if (job->type == JOB_BULK_LINE) {
         do {
             sqlite3_bind_text(db->insert_index_job_stmt, 1, job->bulk_line->doc_id, -1, SQLITE_STATIC);
             sqlite3_bind_int(db->insert_index_job_stmt, 2, job->bulk_line->type);
+            if (job->bulk_line->type != ES_BULK_LINE_DELETE) {
                 sqlite3_bind_text(db->insert_index_job_stmt, 3, job->bulk_line->line, -1, SQLITE_STATIC);
+            } else {
+                sqlite3_bind_null(db->insert_index_job_stmt, 3);
+            }

             ret = sqlite3_step(db->insert_index_job_stmt);
@@ -611,6 +627,8 @@ void database_add_work(database_t *db, job_t *job) {
                 pthread_mutex_unlock(&db->ipc_ctx->db_mutex);
                 usleep(100000);
                 pthread_mutex_lock(&db->ipc_ctx->db_mutex);
+            } else if (ret != SQLITE_OK) {
+                LOG_FATALF("database.c", "sqlite3_reset returned error %d", ret);
             }

         } while (ret != SQLITE_DONE && ret != SQLITE_OK);
@@ -18,6 +18,14 @@ typedef enum {
     FTS_DATABASE
 } database_type_t;

+typedef enum {
+    DATABASE_STAT_INVALID,
+    DATABASE_STAT_TREEMAP,
+    DATABASE_STAT_MIME_AGG,
+    DATABASE_STAT_SIZE_AGG,
+    DATABASE_STAT_DATE_AGG,
+} database_stat_type_d;
+
 typedef enum {
     JOB_UNDEFINED,
     JOB_BULK_LINE,
@@ -104,14 +112,14 @@ database_iterator_t *database_create_document_iterator(database_t *db);
 cJSON *database_document_iter(database_iterator_t *);

 #define database_document_iter_foreach(element, iter) \
-    for (cJSON *element = database_document_iter(iter); element != NULL; element = database_document_iter(iter))
+    for (cJSON *(element) = database_document_iter(iter); (element) != NULL; (element) = database_document_iter(iter))

 database_iterator_t *database_create_delete_list_iterator(database_t *db);

 char *database_delete_list_iter(database_iterator_t *iter);

 #define database_delete_list_iter_foreach(element, iter) \
-    for (char *element = database_delete_list_iter(iter); element != NULL; element = database_delete_list_iter(iter))
+    for (char *(element) = database_delete_list_iter(iter); (element) != NULL; (element) = database_delete_list_iter(iter))


 cJSON *database_incremental_scan_begin(database_t *db);
@@ -132,12 +140,16 @@ treemap_row_t database_treemap_iter(database_iterator_t *iter);

 void database_generate_stats(database_t *db, double treemap_threshold);

+database_stat_type_d database_get_stat_type_by_mnemonic(const char *name);
+
 job_t *database_get_work(database_t *db, job_type_t job_type);

 void database_add_work(database_t *db, job_t *job);

 //void database_index(database_t *db);

+cJSON *database_get_stats(database_t *db, database_stat_type_d type);
+
 #define CRASH_IF_STMT_FAIL(x) do { \
     int return_value = x; \
     if (return_value != SQLITE_DONE && return_value != SQLITE_ROW) { \
@@ -6,6 +6,7 @@
 #define SIZE_BUCKET (long)(5 * 1000 * 1000)
 #define DATE_BUCKET (long)(2629800) // ~30 days

+
 database_iterator_t *database_create_treemap_iterator(database_t *db, long threshold) {

     sqlite3_stmt *stmt;
@@ -157,3 +158,85 @@ void database_generate_stats(database_t *db, double treemap_threshold) {
     LOG_INFO("database.c", "Done!");
 }

+database_stat_type_d database_get_stat_type_by_mnemonic(const char *name) {
+    if (strcmp(name, "TMAP") == 0) {
+        return DATABASE_STAT_TREEMAP;
+    }
+    if (strcmp(name, "MAGG") == 0) {
+        return DATABASE_STAT_MIME_AGG;
+    }
+    if (strcmp(name, "SAGG") == 0) {
+        return DATABASE_STAT_SIZE_AGG;
+    }
+    if (strcmp(name, "DAGG") == 0) {
+        return DATABASE_STAT_DATE_AGG;
+    }
+
+    return DATABASE_STAT_INVALID;
+}
+
+cJSON *database_get_stats(database_t *db, database_stat_type_d type) {
+
+    sqlite3_stmt *stmt;
+
+    switch (type) {
+        case DATABASE_STAT_TREEMAP:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT path,size FROM stats_treemap", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_DATE_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT bucket,count FROM stats_date_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_SIZE_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT bucket,count FROM stats_size_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_MIME_AGG:
+            CRASH_IF_NOT_SQLITE_OK(sqlite3_prepare_v2(
+                    db->db, "SELECT mime,size,count FROM stats_mime_agg", -1, &stmt, NULL
+            ));
+            break;
+        case DATABASE_STAT_INVALID:
+        default:
+            LOG_FATALF("database_stats.c", "Invalid stat type: %d", type);
+    }
+
+    cJSON *json = cJSON_CreateArray();
+
+    int ret;
+    do {
+        ret = sqlite3_step(stmt);
+        CRASH_IF_STMT_FAIL(ret);
+
+        if (ret == SQLITE_DONE) {
+            break;
+        }
+
+        cJSON *row = cJSON_CreateObject();
+
+        switch (type) {
+            case DATABASE_STAT_TREEMAP:
+                cJSON_AddStringToObject(row, "path", (const char *) sqlite3_column_text(stmt, 0));
+                cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
+                break;
+            case DATABASE_STAT_DATE_AGG:
+            case DATABASE_STAT_SIZE_AGG:
+                cJSON_AddNumberToObject(row, "bucket", (double) sqlite3_column_int64(stmt, 0));
+                cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 1));
+                break;
+            case DATABASE_STAT_MIME_AGG:
+                cJSON_AddStringToObject(row, "mime", (const char *) sqlite3_column_text(stmt, 0));
+                cJSON_AddNumberToObject(row, "size", (double) sqlite3_column_int64(stmt, 1));
+                cJSON_AddNumberToObject(row, "count", (double) sqlite3_column_int64(stmt, 2));
+                break;
+        }
+
+        cJSON_AddItemToArray(json, row);
+    } while (TRUE);
+
+    return json;
+}
@@ -1,5 +0,0 @@
-#ifndef SIST2_DATABASE_STATS_H
-#define SIST2_DATABASE_STATS_H
-
-
-#endif //SIST2_DATABASE_STATS_H
@@ -64,20 +64,16 @@ void print_json(cJSON *document, const char id_str[SIST_DOC_ID_LEN]) {
     cJSON_Delete(line);
 }

 void index_json_func(job_t *job) {
     elastic_index_line(job->bulk_line);
 }

 void delete_document(const char *document_id) {
-    es_bulk_line_t *bulk_line = malloc(sizeof(es_bulk_line_t));
+    es_bulk_line_t bulk_line;

-    bulk_line->type = ES_BULK_LINE_DELETE;
-    bulk_line->next = NULL;
-    strcpy(bulk_line->doc_id, document_id);
+    bulk_line.type = ES_BULK_LINE_DELETE;
+    bulk_line.next = NULL;
+    strcpy(bulk_line.doc_id, document_id);

     tpool_add_work(IndexCtx.pool, &(job_t) {
             .type = JOB_BULK_LINE,
-            .bulk_line = bulk_line,
+            .bulk_line = &bulk_line,
     });
 }
@@ -99,6 +95,7 @@ void index_json(cJSON *document, const char doc_id[SIST_DOC_ID_LEN]) {
             .type = JOB_BULK_LINE,
             .bulk_line = bulk_line,
     });
+    free(bulk_line);
 }

 void execute_update_script(const char *script, int async, const char index_id[SIST_INDEX_ID_LEN]) {
@@ -91,8 +91,6 @@ char *build_json_string(document_t *doc) {
     } else {
         cJSON_AddStringToObject(json, "mime", mime_text);
     }
-    cJSON_AddNumberToObject(json, "size", (double) doc->size);
-    cJSON_AddNumberToObject(json, "mtime", doc->mtime);

     // Ignore root directory in the file path
     doc->ext = (short) (doc->ext - ScanCtx.index.desc.root_len);
@@ -122,8 +120,6 @@ char *build_json_string(document_t *doc) {
         cJSON_AddStringToObject(json, "path", "");
     }

-    cJSON_AddStringToObject(json, "_id", doc->doc_id);
-
     // Metadata
     meta_line_t *meta = doc->meta_head;
     while (meta != NULL) {
14 src/main.c
@@ -195,6 +195,10 @@ void initialize_scan_context(scan_args_t *args) {
     ScanCtx.mobi_ctx.content_size = args->content_size;
     ScanCtx.mobi_ctx.log = log_callback;
     ScanCtx.mobi_ctx.logf = logf_callback;
+    ScanCtx.mobi_ctx.store = write_thumbnail_callback;
+    ScanCtx.mobi_ctx.enable_tn = args->tn_count > 0;
+    ScanCtx.mobi_ctx.tn_size = args->tn_size;
+    ScanCtx.mobi_ctx.tn_qscale = args->tn_quality;

     // TEXT
     ScanCtx.text_ctx.content_size = args->content_size;
@@ -312,17 +316,20 @@ void sist2_index(index_args_t *args) {
     database_open(db);
     database_iterator_t *iterator = database_create_document_iterator(db);
     database_document_iter_foreach(json, iterator) {
-        const char *doc_id = cJSON_GetObjectItem(json, "_id")->valuestring;
+        char doc_id[SIST_DOC_ID_LEN];
+        strcpy(doc_id, cJSON_GetObjectItem(json, "_id")->valuestring);
+        cJSON_DeleteItemFromObject(json, "_id");

         if (args->print) {
             print_json(json, doc_id);
         } else {
             index_json(json, doc_id);
             cnt += 1;
         }
         cJSON_Delete(json);
     }

     free(iterator);
-    database_close(db, FALSE);

     if (!args->print) {
         database_iterator_t *del_iter = database_create_delete_list_iterator(db);
@@ -330,8 +337,11 @@ void sist2_index(index_args_t *args) {
             delete_document(id);
             free(id);
         }
+        free(del_iter);
     }

+    database_close(db, FALSE);
+
     tpool_wait(IndexCtx.pool);
     tpool_destroy(IndexCtx.pool);
@@ -51,11 +51,11 @@
 #include <ctype.h>
 #include "git_hash.h"

-#define VERSION "3.0.0"
+#define VERSION "3.0.5"
 static const char *const Version = VERSION;
 static const int VersionMajor = 3;
 static const int VersionMinor = 0;
-static const int VersionPatch = 0;
+static const int VersionPatch = 5;

 #ifndef SIST_PLATFORM
 #define SIST_PLATFORM unknown
@@ -149,6 +149,11 @@ void worker_proc_cleanup(tpool_t *pool) {
     if (ProcData.index_db != NULL) {
         database_close(ProcData.index_db, FALSE);
     }

+    if (IndexCtx.needs_es_connection) {
+        elastic_cleanup();
+    }
+
     database_close(ProcData.ipc_db, FALSE);
 }
@@ -242,6 +247,7 @@ static void *tpool_worker(void *arg) {
     pthread_mutex_lock(&pool->shm->mutex);
     pthread_cond_signal(&pool->shm->done_working_cond);
     pthread_mutex_unlock(&pool->shm->mutex);
+    worker_proc_cleanup(pool);
 #endif

     return NULL;
@@ -20,49 +20,40 @@ static struct mg_http_serve_opts DefaultServeOpts = {

 void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {

-    if (hm->uri.len != SIST_INDEX_ID_LEN + 4) {
+    if (hm->uri.len != SIST_INDEX_ID_LEN + 7) {
         HTTP_REPLY_NOT_FOUND
         return;
     }

     char arg_index_id[SIST_INDEX_ID_LEN];
+    char arg_stat_type[5];

     memcpy(arg_index_id, hm->uri.ptr + 3, SIST_INDEX_ID_LEN);
     *(arg_index_id + SIST_INDEX_ID_LEN - 1) = '\0';
+    memcpy(arg_stat_type, hm->uri.ptr + 3 + SIST_INDEX_ID_LEN, 4);
+    *(arg_stat_type + sizeof(arg_stat_type) - 1) = '\0';

-    index_t *index = web_get_index_by_id(arg_index_id);
-    if (index == NULL) {
+    database_stat_type_d stat_type = database_get_stat_type_by_mnemonic(arg_stat_type);
+    if (stat_type == DATABASE_STAT_INVALID) {
         HTTP_REPLY_NOT_FOUND
         return;
     }

-    const char *file;
-    switch (atoi(hm->uri.ptr + 3 + SIST_INDEX_ID_LEN)) {
-        case 1:
-            file = "treemap.csv";
-            break;
-        case 2:
-            file = "mime_agg.csv";
-            break;
-        case 3:
-            file = "size_agg.csv";
-            break;
-        case 4:
-            file = "date_agg.csv";
-            break;
-        default:
+    database_t *db = web_get_database(arg_index_id);
+    if (db == NULL) {
+        LOG_DEBUGF("serve.c", "Could not get database for index: %s", arg_index_id);
         HTTP_REPLY_NOT_FOUND
         return;
     }

-    char disposition[8192];
-    snprintf(disposition, sizeof(disposition),
-             "Content-Disposition: inline; filename=\"%s\"\r\nCache-Control: max-age=31536000\r\n", file);
+    cJSON *json = database_get_stats(db, stat_type);
+    char *json_str = cJSON_PrintUnformatted(json);

-    char full_path[PATH_MAX];
-    strcpy(full_path, index->path);
-    strcat(full_path, file);
+    web_send_headers(nc, 200, strlen(json_str), "Content-Type: application/json");
+    mg_send(nc, json_str, strlen(json_str));

-    struct mg_http_serve_opts opts = {};
-    mg_http_serve_file(nc, hm, full_path, &opts);
+    free(json_str);
+    cJSON_Delete(json);
 }

 void serve_index_html(struct mg_connection *nc, struct mg_http_message *hm) {
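Client-side view of this change (illustrative only, not part of this diff):
stats are now fetched as JSON from /s/<index-id><mnemonic>, where the mnemonic
is one of TMAP, MAGG, SAGG or DAGG, instead of being served from pre-generated
CSV files. indexId below is a hypothetical index id.

const res = await fetch(`/s/${indexId}TMAP`); // TMAP, MAGG, SAGG or DAGG
const rows = await res.json();                // e.g. [{path: "...", size: 1234}, ...]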
@@ -286,16 +277,23 @@ void index_info(struct mg_connection *nc) {
     cJSON *json = cJSON_CreateObject();
     cJSON *arr = cJSON_AddArrayToObject(json, "indices");

-    cJSON_AddStringToObject(json, "mongooseVersion", MG_VERSION);
     cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
     cJSON_AddStringToObject(json, "version", Version);
+
+#ifdef SIST_DEBUG_INFO
+    cJSON_AddStringToObject(json, "mongooseVersion", MG_VERSION);
     cJSON_AddStringToObject(json, "esVersion", es_version);
-    cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
-    cJSON_AddBoolToObject(json, "esVersionLegacy", IS_LEGACY_VERSION(WebCtx.es_version));
     cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
     cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
-    cJSON_AddStringToObject(json, "lang", WebCtx.lang);
     cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
+    cJSON_AddBoolToObject(json, "showDebugInfo", TRUE);
+#else
+    cJSON_AddBoolToObject(json, "showDebugInfo", FALSE);
+#endif
+
+    cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
+    cJSON_AddBoolToObject(json, "esVersionLegacy", IS_LEGACY_VERSION(WebCtx.es_version));
+    cJSON_AddStringToObject(json, "lang", WebCtx.lang);

     cJSON_AddBoolToObject(json, "auth0Enabled", WebCtx.auth0_enabled);
     if (WebCtx.auth0_enabled) {
@@ -668,6 +666,9 @@ static void ev_router(struct mg_connection *nc, int ev, void *ev_data, UNUSED(vo
         mg_send(nc, r->body, r->size);
     } else if (r->status_code == 0) {
         sist_log("serve.c", LOG_SIST_ERROR, "Could not connect to elasticsearch!");
+
+        mg_http_reply(nc, 503, HTTP_SERVER_HEADER HTTP_TEXT_TYPE_HEADER,
+                      "Elasticsearch connection error, see server logs.");
     } else {
         sist_logf("serve.c", LOG_SIST_WARNING, "ElasticSearch error during query (%d)", r->status_code);
         if (r->size != 0) {
2 third-party/libscan/CMakeLists.txt vendored
@@ -106,7 +106,7 @@ find_library(MUPDF_LIB NAMES liblibmupdf.a)
 find_library(CMS_LIB NAMES lcms2)
 find_library(JAS_LIB NAMES jasper)
 find_library(GUMBO_LIB NAMES gumbo)
-find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/11/)
+find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/11/ /usr/lib/gcc/x86_64-linux-gnu/8/ /usr/lib/gcc/aarch64-linux-gnu/8/)
 find_package(Leptonica CONFIG REQUIRED)
 find_package(FFMPEG REQUIRED)
 find_package(libraw CONFIG REQUIRED)
39 third-party/libscan/libscan/mobi/scan_mobi.c vendored
@@ -1,9 +1,44 @@
 #include "scan_mobi.h"

 #include "../../third-party/libmobi/src/mobi.h"
+#include "../media/media.h"
 #include <errno.h>
 #include "stdlib.h"

+int store_cover(scan_mobi_ctx_t *ctx, document_t *doc, MOBIData *m) {
+    MOBIExthHeader *exth = mobi_get_exthrecord_by_tag(m, EXTH_COVEROFFSET);
+
+    if (exth == NULL) {
+        return FALSE;
+    }
+
+    uint32_t offset = mobi_decode_exthvalue(exth->data, exth->size);
+    size_t first_resource = mobi_get_first_resource_record(m);
+    size_t uid = first_resource + offset;
+    MOBIPdbRecord *record = mobi_get_record_by_seqnumber(m, uid);
+
+    if (record == NULL || record->size < 4) {
+        return FALSE;
+    }
+
+    scan_media_ctx_t media_ctx = {
+            .tn_count = TRUE,
+            .tn_size = ctx->tn_size,
+            .tn_qscale = ctx->tn_qscale,
+            .tesseract_lang = NULL,
+            .tesseract_path = NULL,
+            .read_subtitles = FALSE,
+            .max_media_buffer = 0,
+            .log = ctx->log,
+            .logf = ctx->logf,
+            .store = ctx->store,
+    };
+
+    store_image_thumbnail(&media_ctx, record->data, record->size, doc, "img.jpg");
+
+    return TRUE;
+}
+
 void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

     MOBIData *m = mobi_init();
@@ -72,6 +107,10 @@ void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

     APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

+    if (ctx->enable_tn) {
+        store_cover(ctx, doc, m);
+    }
+
     free(content_str);
     free(buf);
     text_buffer_destroy(&tex);
5 third-party/libscan/libscan/mobi/scan_mobi.h vendored
@@ -7,6 +7,11 @@ typedef struct {
     long content_size;
     log_callback_t log;
     logf_callback_t logf;
+    store_callback_t store;
+
+    int tn_qscale;
+    int tn_size;
+    int enable_tn;
 } scan_mobi_ctx_t;

 void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
2 third-party/libscan/third-party/antiword vendored
Submodule third-party/libscan/third-party/antiword updated: badfdac845...ddb042143e
2 third-party/libscan/third-party/libmobi vendored
Submodule third-party/libscan/third-party/libmobi updated: 395dbde361...864e3a86f2