From 857f3315c274675818d914abd9d8ad12ec7ecd60 Mon Sep 17 00:00:00 2001 From: simon987 Date: Sat, 19 Aug 2023 15:46:19 -0400 Subject: [PATCH] Rework user scripts, update DB schema to support embeddings --- README.md | 24 +- docs/USAGE.md | 13 - docs/scripting.md | 56 +- schema/mappings.json | 40 ++ scripts/generate_big_index.py | 131 ++++ scripts/start_dev_es.sh | 2 +- scripts/start_dev_es_8.sh | 2 +- sist2-admin/frontend/src/App.vue | 4 +- sist2-admin/frontend/src/Sist2AdminApi.js | 32 + .../frontend/src/components/JobSelect.vue | 34 + .../src/components/UserScriptListItem.vue | 18 + .../src/components/UserScriptPicker.vue | 88 +++ sist2-admin/frontend/src/i18n/messages.js | 10 + sist2-admin/frontend/src/router/index.js | 18 +- sist2-admin/frontend/src/views/Home.vue | 88 ++- sist2-admin/frontend/src/views/Job.vue | 13 + .../frontend/src/views/SearchBackend.vue | 3 - sist2-admin/frontend/src/views/Tail.vue | 3 + sist2-admin/frontend/src/views/UserScript.vue | 117 ++++ sist2-admin/requirements.txt | 4 +- sist2-admin/sist2_admin/app.py | 96 ++- sist2-admin/sist2_admin/config.py | 2 + sist2-admin/sist2_admin/jobs.py | 69 +- sist2-admin/sist2_admin/script.py | 126 ++++ sist2-admin/sist2_admin/sist2.py | 11 - sist2-admin/sist2_admin/state.py | 2 +- sist2-vue/package-lock.json | 609 +++--------------- sist2-vue/package.json | 7 +- sist2-vue/src/App.vue | 6 + sist2-vue/src/Sist2Api.ts | 27 +- sist2-vue/src/Sist2ElasticsearchQuery.ts | 59 +- sist2-vue/src/Sist2SqliteQuery.ts | 2 +- sist2-vue/src/components/DebugInfo.vue | 3 +- sist2-vue/src/components/DocCard.vue | 12 +- sist2-vue/src/components/DocFileTitle.vue | 75 ++- sist2-vue/src/components/DocListItem.vue | 262 ++++---- .../src/components/EmbeddingsSearchBar.vue | 142 ++-- sist2-vue/src/components/IndexPicker.vue | 262 ++++---- sist2-vue/src/components/SortSelect.vue | 2 +- sist2-vue/src/components/icons/MlIcon.vue | 32 +- sist2-vue/src/main.js | 1 - sist2-vue/src/ml/CLIPTransformerModel.js | 17 +- 
sist2-vue/src/ml/ModelStore.js | 67 ++ sist2-vue/src/ml/mlUtils.js | 1 - sist2-vue/src/store/index.ts | 4 +- sist2-vue/src/views/SearchPage.vue | 9 +- sist2-vue/vue.config.js | 16 + src/cli.c | 57 +- src/cli.h | 16 - src/database/database.c | 72 ++- src/database/database.h | 9 +- src/database/database_embeddings.c | 81 ++- src/database/database_fts.c | 30 +- src/database/database_schema.c | 17 +- src/index/elastic.c | 67 +- src/index/elastic.h | 6 +- src/main.c | 54 +- src/tpool.c | 5 + src/web/serve.c | 49 +- src/web/web_fts.c | 4 +- src/web/web_util.c | 2 +- src/web/web_util.h | 2 + 62 files changed, 1842 insertions(+), 1250 deletions(-) create mode 100644 scripts/generate_big_index.py create mode 100644 sist2-admin/frontend/src/components/JobSelect.vue create mode 100644 sist2-admin/frontend/src/components/UserScriptListItem.vue create mode 100644 sist2-admin/frontend/src/components/UserScriptPicker.vue create mode 100644 sist2-admin/frontend/src/views/UserScript.vue create mode 100644 sist2-admin/sist2_admin/script.py create mode 100644 sist2-vue/src/ml/ModelStore.js diff --git a/README.md b/README.md index e4792dc..2245ce7 100644 --- a/README.md +++ b/README.md @@ -146,17 +146,17 @@ sist2 v3.0.7+ supports SQLite search backend. The SQLite search backend has fewer features and generally comparable query performance for medium-size indices, but it uses much less memory and is easier to set up. 
-| | SQLite | Elasticsearch | -|----------------------------------------------|:----------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------:| -| Requires separate search engine installation | | ✓ | -| Memory footprint | ~20MB | >500MB | -| Query syntax | [fts5](https://www.sqlite.org/fts5.html) | [query_string](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax) | -| Fuzzy search | | ✓ | -| Media Types tree real-time updating | | ✓ | -| Search in file `path` | | ✓ | -| Manual tagging | ✓ | ✓ | -| User scripts | | ✓ | -| Media Type breakdown for search results | | ✓ | +| | SQLite | Elasticsearch | +|----------------------------------------------|:---------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------:| +| Requires separate search engine installation | | ✓ | +| Memory footprint | ~20MB | >500MB | +| Query syntax | [fts5](https://www.sqlite.org/fts5.html) | [query_string](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax) | +| Fuzzy search | | ✓ | +| Media Types tree real-time updating | | ✓ | +| Search in file `path` | [WIP](https://github.com/simon987/sist2/issues/402) | ✓ | +| Manual tagging | ✓ | ✓ | +| User scripts | ✓ | ✓ | +| Media Type breakdown for search results | | ✓ | ### NER @@ -206,7 +206,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux 3. 
Install vcpkg dependencies ```bash - vcpkg install curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,ffprobe,zlib] + vcpkg install openblas curl[core,openssl] sqlite3[core,fts5] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,ffprobe,zlib] ``` 4. Build diff --git a/docs/USAGE.md b/docs/USAGE.md index 6483d5a..813e8d8 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -5,7 +5,6 @@ Usage: sist2 scan [OPTION]... PATH or: sist2 index [OPTION]... INDEX or: sist2 sqlite-index [OPTION]... INDEX or: sist2 web [OPTION]... INDEX... - or: sist2 exec-script [OPTION]... INDEX Lightning-fast file system indexer and search tool. @@ -74,13 +73,6 @@ Web options --dev Serve html & js files from disk (for development) --lang= Default UI language. Can be changed by the user -Exec-script options - --es-url= Elasticsearch url. DEFAULT: http://localhost:9200 - --es-insecure-ssl Do not verify SSL connections to Elasticsearch. - --es-index= Elasticsearch index name. DEFAULT: sist2 - --script-file= Path to user script. - --async-script Execute user script asynchronously. - Made by simon987 . Released under GPL-3.0 ``` @@ -183,11 +175,6 @@ Using a version >=7.14.0 is recommended to enable the following features: When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI. If you don't care about the features above, you can ignore it or disable it in the configuration page. -## exec-script - -The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. 
Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state. - - # Tagging ### Manual tagging diff --git a/docs/scripting.md b/docs/scripting.md index 01e5aa0..3194b96 100644 --- a/docs/scripting.md +++ b/docs/scripting.md @@ -1,18 +1,47 @@ ## User scripts -*This document is under construction, more in-depth guide coming soon* +User scripts are used to augment your sist2 index with additional metadata, neural network embeddings, tags etc. + + +Since version 3.2.0, user scripts are written in Python, and are run against the sist2 index file. User scripts do not +need a connection to the search backend. + +You can create a user script based on a template from the sist2-admin interface: + +![sist2-admin-scripts](sist2-admin-scripts.png) + +User scripts leverage the [sist2-python](https://github.com/simon987/sist2-python) library to interface with the +index file*. You can find sist2-python documentation and examples +here: [sist2-python.readthedocs.io](https://sist2-python.readthedocs.io/). + +If you are not using the sist2-admin interface, you can run user scripts manually from the command line: + +``` +pip install git+https://github.com/simon987/sist2-python.git + +python my_script.py /path/to/my_index.sist2 +``` + +\* It is possible to manually update the index using raw SQL queries, but the database schema is not stable and +can change at any time; it is recommended to use the more stable sist2-python wrapper instead. + +
+ +
+ Legacy user scripts (sist2 version < 3.2.0) During the `index` step, you can use the `--script-file \ No newline at end of file diff --git a/sist2-admin/frontend/src/components/UserScriptListItem.vue b/sist2-admin/frontend/src/components/UserScriptListItem.vue new file mode 100644 index 0000000..6f2a053 --- /dev/null +++ b/sist2-admin/frontend/src/components/UserScriptListItem.vue @@ -0,0 +1,18 @@ + + + \ No newline at end of file diff --git a/sist2-admin/frontend/src/components/UserScriptPicker.vue b/sist2-admin/frontend/src/components/UserScriptPicker.vue new file mode 100644 index 0000000..c5b3db2 --- /dev/null +++ b/sist2-admin/frontend/src/components/UserScriptPicker.vue @@ -0,0 +1,88 @@ + + + + + \ No newline at end of file diff --git a/sist2-admin/frontend/src/i18n/messages.js b/sist2-admin/frontend/src/i18n/messages.js index 3de6a01..c96b5fd 100644 --- a/sist2-admin/frontend/src/i18n/messages.js +++ b/sist2-admin/frontend/src/i18n/messages.js @@ -54,8 +54,18 @@ export default { frontendTab: "Frontend", backendTab: "Backend", + scripts: "User Scripts", + script: "User Script", + testScript: "Test/debug User Script", + newScriptName: "New script name", + scriptType: "Script type", + scriptCode: "Script code (Python)", + scriptOptions: "User scripts", + gitRepository: "Git repository URL", + extraArgs: "Extra command line arguments", selectJobs: "Available jobs", + selectJob: "Select a job", webOptions: { title: "Web options", lang: "UI Language", diff --git a/sist2-admin/frontend/src/router/index.js b/sist2-admin/frontend/src/router/index.js index 773fbee..700174a 100644 --- a/sist2-admin/frontend/src/router/index.js +++ b/sist2-admin/frontend/src/router/index.js @@ -6,12 +6,18 @@ import Tasks from "@/views/Tasks"; import Frontend from "@/views/Frontend"; import Tail from "@/views/Tail"; import SearchBackend from "@/views/SearchBackend.vue"; +import UserScript from "@/views/UserScript.vue"; Vue.use(VueRouter); const routes = [ { - path: "/", + path: 
"/task", + name: "Tasks", + component: Tasks + }, + { + path: "/:tab?", name: "Home", component: Home }, @@ -20,11 +26,6 @@ const routes = [ name: "Job", component: Job }, - { - path: "/task/", - name: "Tasks", - component: Tasks - }, { path: "/frontend/:name", name: "Frontend", @@ -35,6 +36,11 @@ const routes = [ name: "SearchBackend", component: SearchBackend }, + { + path: "/userScript/:name", + name: "UserScript", + component: UserScript + }, { path: "/log/:taskId", name: "Tail", diff --git a/sist2-admin/frontend/src/views/Home.vue b/sist2-admin/frontend/src/views/Home.vue index b8182b8..94e23ab 100644 --- a/sist2-admin/frontend/src/views/Home.vue +++ b/sist2-admin/frontend/src/views/Home.vue @@ -1,6 +1,6 @@