mirror of
https://github.com/simon987/sist2.git
synced 2025-04-10 14:06:45 +00:00
Compare commits
4 Commits
e1e22fd79a
...
31646a2747
Author | SHA1 | Date | |
---|---|---|---|
31646a2747 | |||
d9d77de47f | |||
5f0957d029 | |||
1cc48f7f33 |
@ -46,7 +46,7 @@ services:
|
||||
- "discovery.type=single-node"
|
||||
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
|
||||
sist2-admin:
|
||||
image: simon987/sist2:3.1.4-x64-linux
|
||||
image: simon987/sist2:3.3.3-x64-linux
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- ./sist2-admin-data/:/sist2-admin/
|
||||
@ -157,6 +157,7 @@ indices, but it uses much less memory and is easier to set up.
|
||||
| Manual tagging | ✓ | ✓ |
|
||||
| User scripts | ✓ | ✓ |
|
||||
| Media Type breakdown for search results | | ✓ |
|
||||
| Embeddings search | ✓ *O(n)* | ✓ *O(log n)* |
|
||||
|
||||
### NER
|
||||
|
||||
|
@ -175,6 +175,32 @@ Using a version >=7.14.0 is recommended to enable the following features:
|
||||
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
|
||||
If you don't care about the features above, you can ignore it or disable it in the configuration page.
|
||||
|
||||
# Embeddings search
|
||||
|
||||
Since v3.2.0, User scripts can be used to generate _embeddings_ (vectors of float32 numbers), which are stored in the `.sist2` index file
|
||||
(see [scripting](scripting.md)). Embeddings can be used for:
|
||||
|
||||
* Nearest-neighbor queries (e.g. "return the documents most similar to this one")
|
||||
* Semantic searches (e.g. "return the documents that are most closely related to the given topic")
|
||||
|
||||
In theory, embeddings can be created for any type of document (image, text, audio, etc.).
|
||||
|
||||
For example, the [clip](https://github.com/simon987/sist2-script-clip) User Script generates 512-d embeddings of images
|
||||
(videos are also supported using the thumbnails generated by sist2). When the user enters a query in the "Embeddings Search"
|
||||
textbox, the query's embedding is generated in their browser, leveraging the ONNX web runtime.
|
||||
|
||||
<details>
|
||||
<summary>Screenshots</summary>
|
||||
|
||||

|
||||

|
||||
|
||||
1. Embeddings search bar. You can select the model using the dropdown on the left.
|
||||
2. This icon appears for indices with embeddings search enabled.
|
||||
3. Documents with this icon have embeddings. Click on the icon to perform KNN search.
|
||||
</details>
|
||||
|
||||
|
||||
# Tagging
|
||||
|
||||
### Manual tagging
|
||||
@ -199,43 +225,4 @@ See [Automatic tagging](#automatic-tagging) for information about tag
|
||||
|
||||
### Automatic tagging
|
||||
|
||||
See [scripting](scripting.md) documentation.
|
||||
|
||||
# Sidecar files
|
||||
|
||||
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
|
||||
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
|
||||
Sidecar files themselves are not saved in the index.
|
||||
|
||||
This feature is useful to leverage third-party applications such as speech-to-text or
|
||||
OCR to add additional metadata to a file.
|
||||
|
||||
**Example**
|
||||
|
||||
```
|
||||
~/Documents/
|
||||
├── Video.mp4
|
||||
└── Video.mp4.s2meta
|
||||
```
|
||||
|
||||
The sidecar file must have exactly the same file path and the `.s2meta` suffix.
|
||||
|
||||
`Video.mp4.s2meta`:
|
||||
```json
|
||||
{
|
||||
"content": "This sidecar file will overwrite some metadata fields of Video.mp4",
|
||||
"author": "Some author",
|
||||
"duration": 12345,
|
||||
"bitrate": 67890,
|
||||
"some_arbitrary_field": [1,2,3]
|
||||
}
|
||||
```
|
||||
|
||||
```
|
||||
sist2 scan ~/Documents -o ./docs.sist2
|
||||
sist2 index ./docs.sist2
|
||||
```
|
||||
|
||||
*NOTE*: It is technically possible to overwrite the `tag` value using sidecar files; however,
|
||||
it is not currently possible to restore both manual tags and sidecar tags without user scripts
|
||||
while reindexing.
|
||||
See [scripting](scripting.md) documentation.
|
BIN
docs/embeddings-1.png
Normal file
BIN
docs/embeddings-1.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 90 KiB |
BIN
docs/embeddings-2.png
Normal file
BIN
docs/embeddings-2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 996 KiB |
@ -90,6 +90,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||
if (insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, req->curl_err_buffer);
|
||||
@ -123,6 +124,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
|
||||
if (insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||
}
|
||||
|
||||
struct curl_slist *headers = NULL;
|
||||
@ -162,6 +164,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||
if (insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||
}
|
||||
|
||||
char err_buffer[CURL_ERROR_SIZE + 1] = {};
|
||||
@ -207,6 +210,7 @@ response_t *web_put(const char *url, const char *data, int insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURLOPT_DNS_LOCAL_IP4);
|
||||
if (insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||
}
|
||||
|
||||
struct curl_slist *headers = NULL;
|
||||
@ -241,6 +245,7 @@ response_t *web_delete(const char *url, int insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||
if (insecure) {
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "");
|
||||
|
@ -51,11 +51,11 @@
|
||||
#include <ctype.h>
|
||||
#include "git_hash.h"
|
||||
|
||||
#define VERSION "3.3.2"
|
||||
#define VERSION "3.3.3"
|
||||
static const char *const Version = VERSION;
|
||||
static const int VersionMajor = 3;
|
||||
static const int VersionMinor = 3;
|
||||
static const int VersionPatch = 2;
|
||||
static const int VersionPatch = 3;
|
||||
|
||||
#ifndef SIST_PLATFORM
|
||||
#define SIST_PLATFORM unknown
|
||||
|
Loading…
x
Reference in New Issue
Block a user