Compare commits

...

9 Commits

Author SHA1 Message Date
f423863acb Add option to search in path for sqlite #402 2023-10-16 21:14:46 -04:00
49a21a5a25 Version bump 2023-10-13 19:02:26 -04:00
560aa82ce7 add discord invite 2023-10-09 18:10:51 -04:00
b8c905bd64 expose indexRoot value to documents 2023-10-08 21:00:03 -04:00
8299237ea0 version bump 2023-10-07 15:04:19 -04:00
31646a2747 Fix CURL error 2023-10-07 13:16:22 -04:00
d9d77de47f Update docs 2023-10-07 11:07:13 -04:00
5f0957d029 Update readme 2023-10-07 10:15:41 -04:00
1cc48f7f33 Version bump 2023-10-07 10:15:03 -04:00
13 changed files with 77 additions and 68 deletions

View File

@@ -4,6 +4,8 @@
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
**Community URL:** [Discord](https://discord.gg/2PEjDy3Rfs)
# sist2
sist2 (Simple incremental search tool)
@@ -46,7 +48,7 @@ services:
- "discovery.type=single-node"
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
sist2-admin:
image: simon987/sist2:3.1.4-x64-linux
image: simon987/sist2:3.3.4-x64-linux
restart: unless-stopped
volumes:
- ./sist2-admin-data/:/sist2-admin/
@@ -153,10 +155,10 @@ indices, but it uses much less memory and is easier to set up.
| Query syntax | [fts5](https://www.sqlite.org/fts5.html) | [query_string](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax) |
| Fuzzy search | | ✓ |
| Media Types tree real-time updating | | ✓ |
| Search in file `path` | [WIP](https://github.com/simon987/sist2/issues/402) | ✓ |
| Manual tagging | ✓ | ✓ |
| User scripts | ✓ | ✓ |
| Media Type breakdown for search results | | ✓ |
| Embeddings search | ✓ *O(n)* | ✓ *O(log n)* |
### NER

View File

@@ -175,6 +175,32 @@ Using a version >=7.14.0 is recommended to enable the following features:
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
If you don't care about the features above, you can ignore it or disable it in the configuration page.
# Embeddings search
Since v3.2.0, User Scripts can be used to generate _embeddings_ (vectors of float32 numbers), which are stored in the `.sist2` index file
(see [scripting](scripting.md)). Embeddings can be used for:
* Nearest-neighbor queries (e.g. "return the documents most similar to this one")
* Semantic searches (e.g. "return the documents that are most closely related to the given topic")
In theory, embeddings can be created for any type of document (image, text, audio, etc.).
For example, the [clip](https://github.com/simon987/sist2-script-clip) User Script generates 512-d embeddings of images
(videos are also supported using the thumbnails generated by sist2). When the user enters a query in the "Embeddings Search"
textbox, the query's embedding is generated in their browser, leveraging the ONNX web runtime.
<details>
<summary>Screenshots</summary>
![embeddings-1](embeddings-1.png)
![embeddings-2](embeddings-2.png)
1. Embeddings search bar. You can select the model using the dropdown on the left.
2. This icon appears for indices with embeddings search enabled.
3. Documents with this icon have embeddings. Click on the icon to perform a KNN search.
</details>
# Tagging
### Manual tagging
@@ -199,43 +225,4 @@ See [Automatic tagging](#automatic-tagging) for information about tag
### Automatic tagging
See [scripting](scripting.md) documentation.
# Sidecar files
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
Sidecar files themselves are not saved in the index.
This feature is useful to leverage third-party applications such as speech-to-text or
OCR to add additional metadata to a file.
**Example**
```
~/Documents/
├── Video.mp4
└── Video.mp4.s2meta
```
The sidecar file must have exactly the same file path and the `.s2meta` suffix.
`Video.mp4.s2meta`:
```json
{
"content": "This sidecar file will overwrite some metadata fields of Video.mp4",
"author": "Some author",
"duration": 12345,
"bitrate": 67890,
"some_arbitrary_field": [1,2,3]
}
```
```
sist2 scan ~/Documents -o ./docs.sist2
sist2 index ./docs.sist2
```
*NOTE*: It is technically possible to overwrite the `tag` value using sidecar files; however,
it is not currently possible to restore both manual tags and sidecar tags without user scripts
while reindexing.
See [scripting](scripting.md) documentation.

BIN
docs/embeddings-1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

BIN
docs/embeddings-2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 996 KiB

View File

@@ -33,18 +33,6 @@ class Sist2Api {
getSist2Info() {
return axios.get(`${this.baseUrl}i`).then(resp => {
const indices = resp.data.indices;
resp.data.indices = indices.map(idx => {
return {
id: idx.id,
name: idx.name,
timestamp: idx.timestamp,
version: idx.version,
models: idx.models,
};
});
this.sist2Info = resp.data;
return resp.data;
@@ -155,6 +143,12 @@ class Sist2Api {
}
}
_getIndexRoot(indexId) {
console.log(indexId)
console.log(this.sist2Info.indices.find(idx => idx.id === indexId))
return this.sist2Info.indices.find(idx => idx.id === indexId).root;
}
esQuery(query) {
return axios.post(`${this.baseUrl}es`, query).then(resp => {
const res = resp.data;
@@ -163,6 +157,7 @@ class Sist2Api {
res.hits.hits.forEach((hit) => {
hit["_source"]["name"] = strUnescape(hit["_source"]["name"]);
hit["_source"]["path"] = strUnescape(hit["_source"]["path"]);
hit["_source"]["indexRoot"] = this._getIndexRoot(hit["_source"]["index"]);
this.setHitProps(hit);
this.setHitTags(hit);

View File

@@ -106,6 +106,8 @@ class Sist2ElasticsearchQuery {
q["sortAsc"] = true;
}
q["searchInPath"] = getters.optSearchInPath;
return q;
}
}

View File

@@ -136,7 +136,7 @@
{{ $t("opt.fuzzy") }}
</b-form-checkbox>
<b-form-checkbox :disabled="uiSqliteMode" :checked="optSearchInPath" @input="setOptSearchInPath">{{
<b-form-checkbox :checked="optSearchInPath" @input="setOptSearchInPath">{{
$t("opt.searchInPath")
}}
</b-form-checkbox>

View File

@@ -160,7 +160,8 @@ void database_fts_index(database_t *db) {
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(
db->db,
"INSERT INTO search(rowid, name, content, title) SELECT id, name, content, title from document_view",
"INSERT INTO search(rowid, name, content, title, path) "
"SELECT id, name, content, title, path from document_view",
NULL, NULL, NULL));
}

View File

@@ -75,23 +75,25 @@ const char *FtsDatabaseSchema =
" WHERE id = OLD.id;"
" END;"
""
"CREATE VIEW IF NOT EXISTS document_view (id, name, content, title)"
"CREATE VIEW IF NOT EXISTS document_view (id, name, content, title, path)"
" AS"
" SELECT id,"
" json_data->>'name',"
" json_data->>'content',"
" json_data->>'title'"
" json_data->>'title',"
" json_data->>'path'"
" FROM document_index;"
""
"CREATE VIRTUAL TABLE IF NOT EXISTS search USING fts5 ("
" name,"
" content,"
" title,"
" path,"
" content='document_view',"
" content_rowid='id'"
");"
// name^8, content^3, title^8
"INSERT INTO search(search, rank) VALUES('rank', 'bm25(8, 3, 8)');"
// name^8, content^3, title^8, path^5
"INSERT INTO search(search, rank) VALUES('rank', 'bm25(8, 3, 8, 5)');"
"";
const char *IpcDatabaseSchema =

View File

@@ -90,6 +90,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
if (insecure) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
}
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, req->curl_err_buffer);
@@ -123,6 +124,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
if (insecure) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
}
struct curl_slist *headers = NULL;
@@ -162,6 +164,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
if (insecure) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
}
char err_buffer[CURL_ERROR_SIZE + 1] = {};
@@ -207,6 +210,7 @@ response_t *web_put(const char *url, const char *data, int insecure) {
curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURLOPT_DNS_LOCAL_IP4);
if (insecure) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
}
struct curl_slist *headers = NULL;
@@ -241,6 +245,7 @@ response_t *web_delete(const char *url, int insecure) {
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
if (insecure) {
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
}
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "");

View File

@@ -51,11 +51,11 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "3.3.2"
#define VERSION "3.4.0"
static const char *const Version = VERSION;
static const int VersionMajor = 3;
static const int VersionMinor = 3;
static const int VersionPatch = 2;
static const int VersionMinor = 4;
static const int VersionPatch = 0;
#ifndef SIST_PLATFORM
#define SIST_PLATFORM unknown

View File

@@ -88,7 +88,7 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
memcpy(index_id_str, hm->uri.ptr + 3, 8);
*(index_id_str + 8) = '\0';
int index_id = (int)strtol(index_id_str, NULL, 16);
int index_id = (int) strtol(index_id_str, NULL, 16);
memcpy(arg_stat_type, hm->uri.ptr + 3 + 9, 4);
*(arg_stat_type + sizeof(arg_stat_type) - 1) = '\0';
@@ -368,6 +368,10 @@ void index_info(struct mg_connection *nc) {
cJSON_AddNumberToObject(idx_json, "timestamp", (double) idx->desc.timestamp);
cJSON_AddItemToArray(arr, idx_json);
#ifdef SIST_DEBUG_INFO
cJSON_AddStringToObject(idx_json, "root", idx->desc.root);
#endif
cJSON *models = database_get_models(idx->db);
cJSON_AddItemToObject(idx_json, "models", models);
}
@@ -480,7 +484,7 @@ tag_req_t *parse_tag_request(cJSON *json) {
return req;
}
subreq_ctx_t *elastic_delete_tag(const char* sid, const tag_req_t *req) {
subreq_ctx_t *elastic_delete_tag(const char *sid, const tag_req_t *req) {
char *buf = malloc(sizeof(char) * 8192);
snprintf(buf, 8192,
"{"
@@ -500,7 +504,7 @@ subreq_ctx_t *elastic_delete_tag(const char* sid, const tag_req_t *req) {
return web_post_async(url, buf, WebCtx.es_insecure_ssl);
}
subreq_ctx_t *elastic_write_tag(const char* sid, const tag_req_t *req) {
subreq_ctx_t *elastic_write_tag(const char *sid, const tag_req_t *req) {
char *buf = malloc(sizeof(char) * 8192);
snprintf(buf, 8192,
"{"

View File

@@ -179,7 +179,8 @@ fts_search_req_t *get_search_req(struct mg_http_message *hm) {
json_value req_query, req_path, req_size_min, req_size_max, req_date_min, req_date_max, req_page_size,
req_index_ids, req_mime_types, req_tags, req_sort_asc, req_sort, req_seed, req_after,
req_fetch_aggregations, req_highlight, req_highlight_context_size, req_embedding, req_model;
req_fetch_aggregations, req_highlight, req_highlight_context_size, req_embedding, req_model,
req_search_in_path;
if (!cJSON_IsObject(json) ||
(req_query = get_json_string(json, "query")).invalid ||
@@ -197,6 +198,7 @@ fts_search_req_t *get_search_req(struct mg_http_message *hm) {
(req_index_ids = get_json_number_array(json, "indexIds")).invalid ||
(req_mime_types = get_json_array(json, "mimeTypes")).invalid ||
(req_highlight = get_json_bool(json, "highlight")).invalid ||
(req_search_in_path = get_json_bool(json, "searchInPath")).invalid ||
(req_highlight_context_size = get_json_number(json, "highlightContextSize")).invalid ||
(req_embedding = get_json_number_array(json, "embedding")).invalid ||
(req_model = get_json_number(json, "model")).invalid ||
@@ -252,7 +254,6 @@ fts_search_req_t *get_search_req(struct mg_http_message *hm) {
fts_search_req_t *req = malloc(sizeof(fts_search_req_t));
req->sort = sort;
req->query = req_query.val ? strdup(req_query.val->valuestring) : NULL;
req->path = req_path.val ? strdup(req_path.val->valuestring) : NULL;
req->size_min = req_size_min.val ? req_size_min.val->valuedouble : 0;
req->size_max = req_size_max.val ? req_size_max.val->valuedouble : 0;
@@ -271,6 +272,16 @@ fts_search_req_t *get_search_req(struct mg_http_message *hm) {
? req_highlight_context_size.val->valueint
: DEFAULT_HIGHLIGHT_CONTEXT_SIZE;
req->model = req_model.val ? req_model.val->valueint : 0;
if (req_search_in_path.val->valueint == FALSE && req_query.val) {
if (asprintf(&req->query, "- path : %s", req_query.val->valuestring) == -1) {
cJSON_Delete(json);
return NULL;
}
} else {
req->query = req_query.val ? strdup(req_query.val->valuestring) : NULL;
}
req->embedding = req_model.val
? get_float_buffer(req_embedding.val, &req->embedding_size)
: NULL;