mirror of
https://github.com/simon987/sist2.git
synced 2025-12-20 10:36:00 +00:00
Compare commits
16 Commits
8fdb832c85
...
3.3.6
| Author | SHA1 | Date | |
|---|---|---|---|
| 49a21a5a25 | |||
| 560aa82ce7 | |||
| b8c905bd64 | |||
| 8299237ea0 | |||
| 31646a2747 | |||
| d9d77de47f | |||
| 5f0957d029 | |||
| 1cc48f7f33 | |||
| e1e22fd79a | |||
| 786bbc3859 | |||
| 9698ea0c37 | |||
| f345fc1a9a | |||
| 660fbf75d8 | |||
| 33ae585879 | |||
| 5729cbd6b4 | |||
| a19ec3305a |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,6 +3,7 @@ thumbs
|
|||||||
*.cbp
|
*.cbp
|
||||||
CMakeCache.txt
|
CMakeCache.txt
|
||||||
CMakeFiles
|
CMakeFiles
|
||||||
|
cmake-build-default-event-trace
|
||||||
cmake-build-debug
|
cmake-build-debug
|
||||||
cmake_install.cmake
|
cmake_install.cmake
|
||||||
Makefile
|
Makefile
|
||||||
|
|||||||
@@ -4,6 +4,8 @@
|
|||||||
|
|
||||||
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
|
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
|
||||||
|
|
||||||
|
**Community URL:** [Discord](https://discord.gg/2PEjDy3Rfs)
|
||||||
|
|
||||||
# sist2
|
# sist2
|
||||||
|
|
||||||
sist2 (Simple incremental search tool)
|
sist2 (Simple incremental search tool)
|
||||||
@@ -46,7 +48,7 @@ services:
|
|||||||
- "discovery.type=single-node"
|
- "discovery.type=single-node"
|
||||||
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
|
- "ES_JAVA_OPTS=-Xms2g -Xmx2g"
|
||||||
sist2-admin:
|
sist2-admin:
|
||||||
image: simon987/sist2:3.1.4-x64-linux
|
image: simon987/sist2:3.3.4-x64-linux
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
volumes:
|
volumes:
|
||||||
- ./sist2-admin-data/:/sist2-admin/
|
- ./sist2-admin-data/:/sist2-admin/
|
||||||
@@ -157,6 +159,7 @@ indices, but it uses much less memory and is easier to set up.
|
|||||||
| Manual tagging | ✓ | ✓ |
|
| Manual tagging | ✓ | ✓ |
|
||||||
| User scripts | ✓ | ✓ |
|
| User scripts | ✓ | ✓ |
|
||||||
| Media Type breakdown for search results | | ✓ |
|
| Media Type breakdown for search results | | ✓ |
|
||||||
|
| Embeddings search | ✓ *O(n)* | ✓ *O(log n)* |
|
||||||
|
|
||||||
### NER
|
### NER
|
||||||
|
|
||||||
|
|||||||
@@ -175,6 +175,32 @@ Using a version >=7.14.0 is recommended to enable the following features:
|
|||||||
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
|
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
|
||||||
If you don't care about the features above, you can ignore it or disable it in the configuration page.
|
If you don't care about the features above, you can ignore it or disable it in the configuration page.
|
||||||
|
|
||||||
|
# Embeddings search
|
||||||
|
|
||||||
|
Since v3.2.0, User scripts can be used to generate _embeddings_ (vectors of float32 numbers), which are stored in the .sist2 index file
|
||||||
|
(see [scripting](scripting.md)). Embeddings can be used for:
|
||||||
|
|
||||||
|
* Nearest-neighbor queries (e.g. "return the documents most similar to this one")
|
||||||
|
* Semantic searches (e.g. "return the documents that are most closely related to the given topic")
|
||||||
|
|
||||||
|
In theory, embeddings can be created for any type of document (image, text, audio, etc.).
|
||||||
|
|
||||||
|
For example, the [clip](https://github.com/simon987/sist2-script-clip) User Script generates 512-d embeddings of images
|
||||||
|
(videos are also supported using the thumbnails generated by sist2). When the user enters a query in the "Embeddings Search"
|
||||||
|
textbox, the query's embedding is generated in their browser, leveraging the ONNX web runtime.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Screenshots</summary>
|
||||||
|
|
||||||
|

|
||||||
|

|
||||||
|
|
||||||
|
1. Embeddings search bar. You can select the model using the dropdown on the left.
|
||||||
|
2. This icon appears for indices with embeddings search enabled.
|
||||||
|
3. Documents with this icon have embeddings. Click on the icon to perform KNN search.
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
# Tagging
|
# Tagging
|
||||||
|
|
||||||
### Manual tagging
|
### Manual tagging
|
||||||
@@ -199,43 +225,4 @@ See [Automatic tagging](#automatic-tagging) for information about tag
|
|||||||
|
|
||||||
### Automatic tagging
|
### Automatic tagging
|
||||||
|
|
||||||
See [scripting](scripting.md) documentation.
|
See [scripting](scripting.md) documentation.
|
||||||
|
|
||||||
# Sidecar files
|
|
||||||
|
|
||||||
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
|
|
||||||
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
|
|
||||||
Sidecar files themselves are not saved in the index.
|
|
||||||
|
|
||||||
This feature is useful to leverage third-party applications such as speech-to-text or
|
|
||||||
OCR to add additional metadata to a file.
|
|
||||||
|
|
||||||
**Example**
|
|
||||||
|
|
||||||
```
|
|
||||||
~/Documents/
|
|
||||||
├── Video.mp4
|
|
||||||
└── Video.mp4.s2meta
|
|
||||||
```
|
|
||||||
|
|
||||||
The sidecar file must have exactly the same file path and the `.s2meta` suffix.
|
|
||||||
|
|
||||||
`Video.mp4.s2meta`:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"content": "This sidecar file will overwrite some metadata fields of Video.mp4",
|
|
||||||
"author": "Some author",
|
|
||||||
"duration": 12345,
|
|
||||||
"bitrate": 67890,
|
|
||||||
"some_arbitrary_field": [1,2,3]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
```
|
|
||||||
sist2 scan ~/Documents -o ./docs.sist2
|
|
||||||
sist2 index ./docs.sist2
|
|
||||||
```
|
|
||||||
|
|
||||||
*NOTE*: It is technically possible to overwrite the `tag` value using sidecar files, however,
|
|
||||||
it is not currently possible to restore both manual tags and sidecar tags without user scripts
|
|
||||||
while reindexing.
|
|
||||||
BIN
docs/embeddings-1.png
Normal file
BIN
docs/embeddings-1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 90 KiB |
BIN
docs/embeddings-2.png
Normal file
BIN
docs/embeddings-2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 996 KiB |
BIN
docs/sist2-admin-scripts.png
Normal file
BIN
docs/sist2-admin-scripts.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 78 KiB |
@@ -81,7 +81,7 @@ function humanDuration(sec_num) {
|
|||||||
return `${seconds}s`;
|
return `${seconds}s`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return "<0s";
|
return "<1s";
|
||||||
}
|
}
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
@@ -134,7 +134,7 @@ export default {
|
|||||||
duration: this.taskDuration(row),
|
duration: this.taskDuration(row),
|
||||||
time: moment.utc(row.started).local().format("dd, MMM Do YYYY, HH:mm:ss"),
|
time: moment.utc(row.started).local().format("dd, MMM Do YYYY, HH:mm:ss"),
|
||||||
logs: null,
|
logs: null,
|
||||||
status: [0,1].includes(row.return_code) ? "ok" : "failed",
|
status: row.return_code === 0 ? "ok" : "failed",
|
||||||
_row: row
|
_row: row
|
||||||
}));
|
}));
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -120,6 +120,10 @@ class Sist2Task:
|
|||||||
|
|
||||||
logger.info(f"Started task {self.display_name}")
|
logger.info(f"Started task {self.display_name}")
|
||||||
|
|
||||||
|
def set_pid(self, pid):
|
||||||
|
self.pid = pid
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Sist2ScanTask(Sist2Task):
|
class Sist2ScanTask(Sist2Task):
|
||||||
|
|
||||||
@@ -133,13 +137,10 @@ class Sist2ScanTask(Sist2Task):
|
|||||||
else:
|
else:
|
||||||
self.job.scan_options.output = None
|
self.job.scan_options.output = None
|
||||||
|
|
||||||
def set_pid(pid):
|
return_code = sist2.scan(self.job.scan_options, logs_cb=self.log_callback, set_pid_cb=self.set_pid)
|
||||||
self.pid = pid
|
|
||||||
|
|
||||||
return_code = sist2.scan(self.job.scan_options, logs_cb=self.log_callback, set_pid_cb=set_pid)
|
|
||||||
self.ended = datetime.utcnow()
|
self.ended = datetime.utcnow()
|
||||||
|
|
||||||
is_ok = return_code in (0, 1)
|
is_ok = (return_code in (0, 1)) if "debug" in sist2.bin_path else (return_code == 0)
|
||||||
|
|
||||||
if not is_ok:
|
if not is_ok:
|
||||||
self._logger.error(json.dumps({"sist2-admin": f"Process returned non-zero exit code ({return_code})"}))
|
self._logger.error(json.dumps({"sist2-admin": f"Process returned non-zero exit code ({return_code})"}))
|
||||||
@@ -165,6 +166,9 @@ class Sist2ScanTask(Sist2Task):
|
|||||||
self.job.previous_index_path = self.job.index_path
|
self.job.previous_index_path = self.job.index_path
|
||||||
db["jobs"][self.job.name] = self.job
|
db["jobs"][self.job.name] = self.job
|
||||||
|
|
||||||
|
if is_ok:
|
||||||
|
return 0
|
||||||
|
|
||||||
return return_code
|
return return_code
|
||||||
|
|
||||||
|
|
||||||
@@ -185,7 +189,7 @@ class Sist2IndexTask(Sist2Task):
|
|||||||
|
|
||||||
logger.debug(f"Fetched search backend options for {self.job.index_options.search_backend}")
|
logger.debug(f"Fetched search backend options for {self.job.index_options.search_backend}")
|
||||||
|
|
||||||
return_code = sist2.index(self.job.index_options, search_backend, logs_cb=self.log_callback)
|
return_code = sist2.index(self.job.index_options, search_backend, logs_cb=self.log_callback, set_pid_cb=self.set_pid)
|
||||||
self.ended = datetime.utcnow()
|
self.ended = datetime.utcnow()
|
||||||
|
|
||||||
duration = self.ended - self.started
|
duration = self.ended - self.started
|
||||||
@@ -249,7 +253,7 @@ class Sist2UserScriptTask(Sist2Task):
|
|||||||
super().run(sist2, db)
|
super().run(sist2, db)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.user_script.setup(self.log_callback)
|
self.user_script.setup(self.log_callback, self.set_pid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Setup for {self.user_script.name} failed: ")
|
logger.error(f"Setup for {self.user_script.name} failed: ")
|
||||||
logger.exception(e)
|
logger.exception(e)
|
||||||
@@ -269,7 +273,7 @@ class Sist2UserScriptTask(Sist2Task):
|
|||||||
self.log_callback({"sist2-admin": f"Starting user script with {executable=}, {index_path=}, {extra_args=}"})
|
self.log_callback({"sist2-admin": f"Starting user script with {executable=}, {index_path=}, {extra_args=}"})
|
||||||
|
|
||||||
proc = Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.user_script.script_dir())
|
proc = Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.user_script.script_dir())
|
||||||
self.pid = proc.pid
|
self.set_pid(proc.pid)
|
||||||
|
|
||||||
t_stderr = Thread(target=self._consume_logs, args=(self.log_callback, proc, "stderr", False))
|
t_stderr = Thread(target=self._consume_logs, args=(self.log_callback, proc, "stderr", False))
|
||||||
t_stderr.start()
|
t_stderr.start()
|
||||||
@@ -316,7 +320,7 @@ class TaskQueue:
|
|||||||
def _tasks_failed(self):
|
def _tasks_failed(self):
|
||||||
done = set()
|
done = set()
|
||||||
|
|
||||||
for row in self._db["task_done"].sql("WHERE return_code NOT IN (0,1)"):
|
for row in self._db["task_done"].sql("WHERE return_code != 0"):
|
||||||
done.add(uuid.UUID(row["id"]))
|
done.add(uuid.UUID(row["id"]))
|
||||||
|
|
||||||
return done
|
return done
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def set_executable(file):
|
|||||||
os.chmod(file, os.stat(file).st_mode | stat.S_IEXEC)
|
os.chmod(file, os.stat(file).st_mode | stat.S_IEXEC)
|
||||||
|
|
||||||
|
|
||||||
def _initialize_git_repository(url, path, log_cb, force_clone):
|
def _initialize_git_repository(url, path, log_cb, force_clone, set_pid_cb):
|
||||||
log_cb({"sist2-admin": f"Cloning {url}"})
|
log_cb({"sist2-admin": f"Cloning {url}"})
|
||||||
|
|
||||||
if force_clone or not os.path.exists(os.path.join(path, ".git")):
|
if force_clone or not os.path.exists(os.path.join(path, ".git")):
|
||||||
@@ -36,14 +36,18 @@ def _initialize_git_repository(url, path, log_cb, force_clone):
|
|||||||
log_cb({"sist2-admin": f"Executing setup script {setup_script}"})
|
log_cb({"sist2-admin": f"Executing setup script {setup_script}"})
|
||||||
|
|
||||||
set_executable(setup_script)
|
set_executable(setup_script)
|
||||||
result = subprocess.run([setup_script], cwd=path, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
proc = subprocess.Popen([setup_script], cwd=path, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||||
for line in result.stdout.split(b"\n"):
|
set_pid_cb(proc.pid)
|
||||||
|
proc.wait()
|
||||||
|
stdout = proc.stdout.read()
|
||||||
|
|
||||||
|
for line in stdout.split(b"\n"):
|
||||||
if line:
|
if line:
|
||||||
log_cb({"stdout": line.decode()})
|
log_cb({"stdout": line.decode()})
|
||||||
|
|
||||||
log_cb({"stdout": f"Executed setup script {setup_script}, return code = {result.returncode}"})
|
log_cb({"stdout": f"Executed setup script {setup_script}, return code = {proc.returncode}"})
|
||||||
|
|
||||||
if result.returncode != 0:
|
if proc.returncode != 0:
|
||||||
raise Exception("Error when running setup script!")
|
raise Exception("Error when running setup script!")
|
||||||
|
|
||||||
log_cb({"sist2-admin": f"Initialized git repository in {path}"})
|
log_cb({"sist2-admin": f"Initialized git repository in {path}"})
|
||||||
@@ -60,11 +64,11 @@ class UserScript(BaseModel):
|
|||||||
def script_dir(self):
|
def script_dir(self):
|
||||||
return os.path.join(SCRIPT_FOLDER, self.name)
|
return os.path.join(SCRIPT_FOLDER, self.name)
|
||||||
|
|
||||||
def setup(self, log_cb):
|
def setup(self, log_cb, set_pid_cb):
|
||||||
os.makedirs(self.script_dir(), exist_ok=True)
|
os.makedirs(self.script_dir(), exist_ok=True)
|
||||||
|
|
||||||
if self.type == ScriptType.GIT:
|
if self.type == ScriptType.GIT:
|
||||||
_initialize_git_repository(self.git_repository, self.script_dir(), log_cb, self.force_clone)
|
_initialize_git_repository(self.git_repository, self.script_dir(), log_cb, self.force_clone, set_pid_cb)
|
||||||
self.force_clone = False
|
self.force_clone = False
|
||||||
elif self.type == ScriptType.SIMPLE:
|
elif self.type == ScriptType.SIMPLE:
|
||||||
self._setup_simple()
|
self._setup_simple()
|
||||||
|
|||||||
@@ -243,7 +243,7 @@ class Sist2:
|
|||||||
self.bin_path = bin_path
|
self.bin_path = bin_path
|
||||||
self._data_dir = data_directory
|
self._data_dir = data_directory
|
||||||
|
|
||||||
def index(self, options: IndexOptions, search_backend: Sist2SearchBackend, logs_cb):
|
def index(self, options: IndexOptions, search_backend: Sist2SearchBackend, logs_cb, set_pid_cb):
|
||||||
|
|
||||||
args = [
|
args = [
|
||||||
self.bin_path,
|
self.bin_path,
|
||||||
@@ -255,6 +255,8 @@ class Sist2:
|
|||||||
logs_cb({"sist2-admin": f"Starting sist2 command with args {args}"})
|
logs_cb({"sist2-admin": f"Starting sist2 command with args {args}"})
|
||||||
proc = Popen(args, stdout=PIPE, stderr=PIPE)
|
proc = Popen(args, stdout=PIPE, stderr=PIPE)
|
||||||
|
|
||||||
|
set_pid_cb(proc.pid)
|
||||||
|
|
||||||
t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
|
t_stderr = Thread(target=self._consume_logs_stderr, args=(logs_cb, proc))
|
||||||
t_stderr.start()
|
t_stderr.start()
|
||||||
|
|
||||||
|
|||||||
@@ -33,18 +33,6 @@ class Sist2Api {
|
|||||||
|
|
||||||
getSist2Info() {
|
getSist2Info() {
|
||||||
return axios.get(`${this.baseUrl}i`).then(resp => {
|
return axios.get(`${this.baseUrl}i`).then(resp => {
|
||||||
const indices = resp.data.indices;
|
|
||||||
|
|
||||||
resp.data.indices = indices.map(idx => {
|
|
||||||
return {
|
|
||||||
id: idx.id,
|
|
||||||
name: idx.name,
|
|
||||||
timestamp: idx.timestamp,
|
|
||||||
version: idx.version,
|
|
||||||
models: idx.models,
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
this.sist2Info = resp.data;
|
this.sist2Info = resp.data;
|
||||||
|
|
||||||
return resp.data;
|
return resp.data;
|
||||||
@@ -155,6 +143,12 @@ class Sist2Api {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_getIndexRoot(indexId) {
|
||||||
|
console.log(indexId)
|
||||||
|
console.log(this.sist2Info.indices.find(idx => idx.id === indexId))
|
||||||
|
return this.sist2Info.indices.find(idx => idx.id === indexId).root;
|
||||||
|
}
|
||||||
|
|
||||||
esQuery(query) {
|
esQuery(query) {
|
||||||
return axios.post(`${this.baseUrl}es`, query).then(resp => {
|
return axios.post(`${this.baseUrl}es`, query).then(resp => {
|
||||||
const res = resp.data;
|
const res = resp.data;
|
||||||
@@ -163,6 +157,7 @@ class Sist2Api {
|
|||||||
res.hits.hits.forEach((hit) => {
|
res.hits.hits.forEach((hit) => {
|
||||||
hit["_source"]["name"] = strUnescape(hit["_source"]["name"]);
|
hit["_source"]["name"] = strUnescape(hit["_source"]["name"]);
|
||||||
hit["_source"]["path"] = strUnescape(hit["_source"]["path"]);
|
hit["_source"]["path"] = strUnescape(hit["_source"]["path"]);
|
||||||
|
hit["_source"]["indexRoot"] = this._getIndexRoot(hit["_source"]["index"]);
|
||||||
|
|
||||||
this.setHitProps(hit);
|
this.setHitProps(hit);
|
||||||
this.setHitTags(hit);
|
this.setHitTags(hit);
|
||||||
@@ -421,7 +416,9 @@ class Sist2Api {
|
|||||||
return axios.get(`${this.baseUrl}fts/dateRange`)
|
return axios.get(`${this.baseUrl}fts/dateRange`)
|
||||||
.then(resp => ({
|
.then(resp => ({
|
||||||
min: resp.data.dateMin,
|
min: resp.data.dateMin,
|
||||||
max: resp.data.dateMax,
|
max: (resp.data.dateMax === resp.data.dateMin)
|
||||||
|
? resp.data.dateMax + 1
|
||||||
|
: resp.data.dateMax,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -585,7 +582,7 @@ class Sist2Api {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
getTagSuggestions(prefix) {
|
getTagSuggestions(prefix) {
|
||||||
if (this.backend() === "sqlite") {
|
if (this.backend() === "sqlite") {
|
||||||
return this.getTagSuggestionsSqlite(prefix);
|
return this.getTagSuggestionsSqlite(prefix);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
24
src/cli.c
24
src/cli.c
@@ -74,6 +74,21 @@ void sqlite_index_args_destroy(sqlite_index_args_t *args) {
|
|||||||
free(args);
|
free(args);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Ensure that a heap-allocated path ends with a '/'.
 *
 * The buffer is grown with realloc(), so abs_path must come from
 * malloc()/realloc(), and the caller must continue with the returned pointer
 * instead of the one passed in.
 *
 * @param abs_path NUL-terminated, heap-allocated path; may be NULL
 * @return NULL when abs_path is NULL; abs_path itself when it already ends
 *         with '/' (this includes the root directory "/"); otherwise the
 *         reallocated buffer with '/' appended
 */
char *add_trailing_slash(char *abs_path) {
    if (abs_path == NULL) {
        return NULL;
    }

    size_t len = strlen(abs_path);

    if (len > 0 && abs_path[len - 1] == '/') {
        // Already ends with a slash (covers the "/" special case):
        // appending another one would produce "//"
        return abs_path;
    }

    // One extra byte for '/' and one for the NUL terminator
    char *new_abs_path = realloc(abs_path, len + 2);
    if (new_abs_path == NULL) {
        // realloc() leaves the original buffer intact on failure, so abs_path
        // can still be printed here
        LOG_FATALF("cli.c", "FIXME: realloc() failed for abs_path=%s", abs_path);
        // NOTE(review): LOG_FATALF presumably terminates the process; this
        // return only prevents a write through NULL if it does not - confirm
        return abs_path;
    }

    new_abs_path[len] = '/';
    new_abs_path[len + 1] = '\0';

    return new_abs_path;
}
|
||||||
|
|
||||||
int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
fprintf(stderr, "Required positional argument: PATH.\n");
|
fprintf(stderr, "Required positional argument: PATH.\n");
|
||||||
@@ -83,15 +98,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
|||||||
char *abs_path = abspath(argv[1]);
|
char *abs_path = abspath(argv[1]);
|
||||||
if (abs_path == NULL) {
|
if (abs_path == NULL) {
|
||||||
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
|
LOG_FATALF("cli.c", "Invalid PATH argument. File not found: %s", argv[1]);
|
||||||
} else {
|
|
||||||
char *new_abs_path = realloc(abs_path, strlen(abs_path) + 2);
|
|
||||||
if (new_abs_path == NULL) {
|
|
||||||
LOG_FATALF("cli.c", "FIXME: realloc() failed for argv[1]=%s, abs_path=%s", argv[1], abs_path);
|
|
||||||
}
|
|
||||||
strcat(new_abs_path, "/");
|
|
||||||
args->path = new_abs_path;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
args->path = add_trailing_slash(abs_path);
|
||||||
|
|
||||||
if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) {
|
if (args->tn_quality == OPTION_VALUE_UNSPECIFIED) {
|
||||||
args->tn_quality = DEFAULT_QUALITY;
|
args->tn_quality = DEFAULT_QUALITY;
|
||||||
} else if (args->tn_quality < 0 || args->tn_quality > 100) {
|
} else if (args->tn_quality < 0 || args->tn_quality > 100) {
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
#include "ctx.h"
|
#include "ctx.h"
|
||||||
|
|
||||||
ScanCtx_t ScanCtx = {
|
ScanCtx_t ScanCtx = {
|
||||||
.stat_index_size = 0,
|
|
||||||
.stat_tn_size = 0,
|
|
||||||
.pool = NULL,
|
.pool = NULL,
|
||||||
.index.path = {0,},
|
.index.path = {0,},
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -31,9 +31,6 @@ typedef struct {
|
|||||||
int depth;
|
int depth;
|
||||||
int calculate_checksums;
|
int calculate_checksums;
|
||||||
|
|
||||||
size_t stat_tn_size;
|
|
||||||
size_t stat_index_size;
|
|
||||||
|
|
||||||
pcre *exclude;
|
pcre *exclude;
|
||||||
pcre_extra *exclude_extra;
|
pcre_extra *exclude_extra;
|
||||||
int fast;
|
int fast;
|
||||||
|
|||||||
@@ -149,7 +149,7 @@ void database_open(database_t *db) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SIST_DEBUG
|
#ifdef SIST_DEBUG
|
||||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA foreign_keys = ON;", NULL, NULL, NULL));
|
// CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA foreign_keys = ON;", NULL, NULL, NULL));
|
||||||
#else
|
#else
|
||||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA ignore_check_constraints = ON;", NULL, NULL, NULL));
|
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "PRAGMA ignore_check_constraints = ON;", NULL, NULL, NULL));
|
||||||
#endif
|
#endif
|
||||||
@@ -373,7 +373,7 @@ void database_open(database_t *db) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void database_close(database_t *db, int optimize) {
|
void database_close(database_t *db, int optimize) {
|
||||||
LOG_DEBUGF("database.c", "Closing database %s", db->filename);
|
LOG_DEBUGF("database.c", "Closing database %s (%p)", db->filename, db->db);
|
||||||
|
|
||||||
if (optimize) {
|
if (optimize) {
|
||||||
LOG_DEBUG("database.c", "Optimizing database");
|
LOG_DEBUG("database.c", "Optimizing database");
|
||||||
@@ -516,32 +516,31 @@ database_iterator_t *database_create_document_iterator(database_t *db) {
|
|||||||
CRASH_IF_NOT_SQLITE_OK(
|
CRASH_IF_NOT_SQLITE_OK(
|
||||||
sqlite3_prepare_v2(
|
sqlite3_prepare_v2(
|
||||||
db->db,
|
db->db,
|
||||||
"WITH doc (j) AS (SELECT CASE"
|
"WITH doc (id, j) AS ("
|
||||||
" WHEN emb.embedding IS NULL THEN"
|
"SELECT"
|
||||||
" json_set(document.json_data, "
|
" document.id,"
|
||||||
" '$._id', document.id, "
|
" json_set(document.json_data,"
|
||||||
" '$.size', document.size, "
|
" '$._id', document.id,"
|
||||||
" '$.mtime', document.mtime, "
|
" '$.index', (SELECT id FROM descriptor),"
|
||||||
" '$.mime', mim.name,"
|
" '$.size', document.size,"
|
||||||
" '$.thumbnail', document.thumbnail_count, "
|
" '$.mtime', document.mtime,"
|
||||||
" '$.tag', json_group_array((SELECT tag FROM tag WHERE document.id = tag.id)))"
|
" '$.mime', mim.name,"
|
||||||
" ELSE"
|
" '$.thumbnail', document.thumbnail_count,"
|
||||||
" json_set(document.json_data,"
|
" '$.tag', json_group_array(t.tag))"
|
||||||
" '$._id', document.id,"
|
|
||||||
" '$.size', document.size,"
|
|
||||||
" '$.mtime', document.mtime,"
|
|
||||||
" '$.mime', mim.name,"
|
|
||||||
" '$.thumbnail', document.thumbnail_count, "
|
|
||||||
" '$.tag', json_group_array((SELECT tag FROM tag WHERE document.id = tag.id)),"
|
|
||||||
" '$.emb', json_group_object(m.path, json(emb_to_json(emb.embedding))),"
|
|
||||||
" '$.embedding', 1)"
|
|
||||||
" END"
|
|
||||||
" FROM document"
|
" FROM document"
|
||||||
" LEFT JOIN embedding emb ON document.id = emb.id"
|
" LEFT JOIN mime mim ON mim.id = document.mime"
|
||||||
" LEFT JOIN model m ON emb.model_id = m.id"
|
" LEFT JOIN tag t ON t.id = document.id"
|
||||||
" LEFT JOIN mime mim ON mim.id = document.mime"
|
|
||||||
" GROUP BY document.id)"
|
" GROUP BY document.id)"
|
||||||
" SELECT json_set(j, '$.index', (SELECT id FROM descriptor)) FROM doc",
|
"SELECT CASE"
|
||||||
|
" WHEN emb.embedding IS NULL THEN j"
|
||||||
|
" ELSE json_set(j,"
|
||||||
|
" '$.emb', json_group_object(m.path, json(emb_to_json(emb.embedding))),"
|
||||||
|
" '$.embedding', 1"
|
||||||
|
" ) END"
|
||||||
|
" FROM doc"
|
||||||
|
" LEFT JOIN embedding emb ON doc.id = emb.id"
|
||||||
|
" LEFT JOIN model m ON emb.model_id = m.id"
|
||||||
|
" GROUP BY doc.id",
|
||||||
-1, &stmt, NULL));
|
-1, &stmt, NULL));
|
||||||
|
|
||||||
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
|
database_iterator_t *iter = malloc(sizeof(database_iterator_t));
|
||||||
@@ -594,8 +593,9 @@ cJSON *database_document_iter(database_iterator_t *iter) {
|
|||||||
cJSON *database_incremental_scan_begin(database_t *db) {
|
cJSON *database_incremental_scan_begin(database_t *db) {
|
||||||
LOG_DEBUG("database.c", "Preparing database for incremental scan");
|
LOG_DEBUG("database.c", "Preparing database for incremental scan");
|
||||||
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM marked;", NULL, NULL, NULL));
|
CRASH_IF_NOT_SQLITE_OK(sqlite3_exec(db->db, "DELETE FROM marked;", NULL, NULL, NULL));
|
||||||
|
LOG_DEBUG("database.c", "Preparing database for incremental scan (create marked table)");
|
||||||
CRASH_IF_NOT_SQLITE_OK(
|
CRASH_IF_NOT_SQLITE_OK(
|
||||||
sqlite3_exec(db->db, "INSERT INTO marked SELECT ROWID, 0, mtime FROM document;", NULL, NULL, NULL));
|
sqlite3_exec(db->db, "INSERT INTO marked SELECT id, 0, mtime FROM document;", NULL, NULL, NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
cJSON *database_incremental_scan_end(database_t *db) {
|
cJSON *database_incremental_scan_end(database_t *db) {
|
||||||
|
|||||||
@@ -105,7 +105,6 @@ typedef struct database {
|
|||||||
sqlite3_stmt *fts_write_tag_stmt;
|
sqlite3_stmt *fts_write_tag_stmt;
|
||||||
sqlite3_stmt *fts_model_size;
|
sqlite3_stmt *fts_model_size;
|
||||||
|
|
||||||
|
|
||||||
char **tag_array;
|
char **tag_array;
|
||||||
|
|
||||||
database_ipc_ctx_t *ipc_ctx;
|
database_ipc_ctx_t *ipc_ctx;
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ subreq_ctx_t *web_post_async(const char *url, char *data, int insecure) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||||
if (insecure) {
|
if (insecure) {
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, req->curl_err_buffer);
|
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, req->curl_err_buffer);
|
||||||
@@ -123,6 +124,7 @@ response_t *web_get(const char *url, int timeout, int insecure) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
|
||||||
if (insecure) {
|
if (insecure) {
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct curl_slist *headers = NULL;
|
struct curl_slist *headers = NULL;
|
||||||
@@ -162,6 +164,7 @@ response_t *web_post(const char *url, const char *data, int insecure) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||||
if (insecure) {
|
if (insecure) {
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
char err_buffer[CURL_ERROR_SIZE + 1] = {};
|
char err_buffer[CURL_ERROR_SIZE + 1] = {};
|
||||||
@@ -207,6 +210,7 @@ response_t *web_put(const char *url, const char *data, int insecure) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURLOPT_DNS_LOCAL_IP4);
|
curl_easy_setopt(curl, CURLOPT_IPRESOLVE, CURLOPT_DNS_LOCAL_IP4);
|
||||||
if (insecure) {
|
if (insecure) {
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct curl_slist *headers = NULL;
|
struct curl_slist *headers = NULL;
|
||||||
@@ -241,6 +245,7 @@ response_t *web_delete(const char *url, int insecure) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, "sist2");
|
||||||
if (insecure) {
|
if (insecure) {
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "");
|
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "");
|
||||||
|
|||||||
@@ -260,9 +260,6 @@ void sist2_scan(scan_args_t *args) {
|
|||||||
tpool_wait(ScanCtx.pool);
|
tpool_wait(ScanCtx.pool);
|
||||||
tpool_destroy(ScanCtx.pool);
|
tpool_destroy(ScanCtx.pool);
|
||||||
|
|
||||||
LOG_DEBUGF("main.c", "Thumbnail store size: %lu", ScanCtx.stat_tn_size);
|
|
||||||
LOG_DEBUGF("main.c", "Index size: %lu", ScanCtx.stat_index_size);
|
|
||||||
|
|
||||||
database_t *db = database_create(args->output, INDEX_DATABASE);
|
database_t *db = database_create(args->output, INDEX_DATABASE);
|
||||||
database_open(db);
|
database_open(db);
|
||||||
|
|
||||||
@@ -356,7 +353,6 @@ void sist2_sqlite_index(sqlite_index_args_t *args) {
|
|||||||
database_fts_optimize(db);
|
database_fts_optimize(db);
|
||||||
|
|
||||||
database_close(db, FALSE);
|
database_close(db, FALSE);
|
||||||
database_close(search_db, FALSE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void sist2_web(web_args_t *args) {
|
void sist2_web(web_args_t *args) {
|
||||||
|
|||||||
@@ -51,11 +51,11 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include "git_hash.h"
|
#include "git_hash.h"
|
||||||
|
|
||||||
#define VERSION "3.3.0"
|
#define VERSION "3.3.6"
|
||||||
static const char *const Version = VERSION;
|
static const char *const Version = VERSION;
|
||||||
static const int VersionMajor = 3;
|
static const int VersionMajor = 3;
|
||||||
static const int VersionMinor = 3;
|
static const int VersionMinor = 3;
|
||||||
static const int VersionPatch = 0;
|
static const int VersionPatch = 6;
|
||||||
|
|
||||||
#ifndef SIST_PLATFORM
|
#ifndef SIST_PLATFORM
|
||||||
#define SIST_PLATFORM unknown
|
#define SIST_PLATFORM unknown
|
||||||
|
|||||||
26
src/tpool.c
26
src/tpool.c
@@ -77,14 +77,14 @@ static void worker_thread_loop(tpool_t *pool) {
|
|||||||
job_t *job = database_get_work(ProcData.ipc_db, pool->shm->job_type);
|
job_t *job = database_get_work(ProcData.ipc_db, pool->shm->job_type);
|
||||||
|
|
||||||
if (job != NULL) {
|
if (job != NULL) {
|
||||||
pthread_mutex_lock(&(pool->shm->data_mutex));
|
|
||||||
pool->shm->busy_count += 1;
|
|
||||||
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
|
||||||
|
|
||||||
if (pool->shm->stop) {
|
if (pool->shm->stop) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pthread_mutex_lock(&(pool->shm->data_mutex));
|
||||||
|
pool->shm->busy_count += 1;
|
||||||
|
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
||||||
|
|
||||||
if (job->type == JOB_PARSE_JOB) {
|
if (job->type == JOB_PARSE_JOB) {
|
||||||
parse(job->parse_job);
|
parse(job->parse_job);
|
||||||
} else if (job->type == JOB_BULK_LINE) {
|
} else if (job->type == JOB_BULK_LINE) {
|
||||||
@@ -110,11 +110,11 @@ static void worker_thread_loop(tpool_t *pool) {
|
|||||||
if (LogCtx.json_logs) {
|
if (LogCtx.json_logs) {
|
||||||
progress_bar_print_json(done,
|
progress_bar_print_json(done,
|
||||||
count,
|
count,
|
||||||
ScanCtx.stat_tn_size,
|
0,
|
||||||
ScanCtx.stat_index_size, pool->shm->waiting);
|
0, pool->shm->waiting);
|
||||||
} else {
|
} else {
|
||||||
progress_bar_print((double) done / count,
|
progress_bar_print((double) done / count,
|
||||||
ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
|
0, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,11 +200,11 @@ static void *tpool_worker(void *arg) {
|
|||||||
pool->shm->ipc_ctx.completed_job_count += 1;
|
pool->shm->ipc_ctx.completed_job_count += 1;
|
||||||
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
|
pthread_mutex_unlock(&(pool->shm->ipc_ctx.mutex));
|
||||||
|
|
||||||
pthread_mutex_lock(&(pool->shm->data_mutex));
|
|
||||||
pool->shm->busy_count -= 1;
|
|
||||||
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
|
||||||
|
|
||||||
if (WIFSIGNALED(status)) {
|
if (WIFSIGNALED(status)) {
|
||||||
|
pthread_mutex_lock(&(pool->shm->data_mutex));
|
||||||
|
pool->shm->busy_count -= 1;
|
||||||
|
pthread_mutex_unlock(&(pool->shm->data_mutex));
|
||||||
|
|
||||||
int crashed_thread_id = -1;
|
int crashed_thread_id = -1;
|
||||||
for (int i = 0; i < MAX_THREADS; i++) {
|
for (int i = 0; i < MAX_THREADS; i++) {
|
||||||
if (pool->shm->thread_id_to_pid_mapping[i] == pid) {
|
if (pool->shm->thread_id_to_pid_mapping[i] == pid) {
|
||||||
@@ -265,14 +265,14 @@ void tpool_wait(tpool_t *pool) {
|
|||||||
if (pool->shm->ipc_ctx.job_count > 0) {
|
if (pool->shm->ipc_ctx.job_count > 0) {
|
||||||
pthread_cond_wait(&(pool->shm->done_working_cond), &pool->shm->mutex);
|
pthread_cond_wait(&(pool->shm->done_working_cond), &pool->shm->mutex);
|
||||||
} else {
|
} else {
|
||||||
if (pool->shm->ipc_ctx.job_count == 0 && pool->shm->busy_count == 0) {
|
if (pool->shm->ipc_ctx.job_count == 0 && pool->shm->busy_count <= 0) {
|
||||||
pool->shm->stop = TRUE;
|
pool->shm->stop = TRUE;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pool->print_progress && !LogCtx.json_logs) {
|
if (pool->print_progress && !LogCtx.json_logs) {
|
||||||
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
|
progress_bar_print(1.0, 0, 0);
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&pool->shm->mutex);
|
pthread_mutex_unlock(&pool->shm->mutex);
|
||||||
|
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ void stats_files(struct mg_connection *nc, struct mg_http_message *hm) {
|
|||||||
|
|
||||||
memcpy(index_id_str, hm->uri.ptr + 3, 8);
|
memcpy(index_id_str, hm->uri.ptr + 3, 8);
|
||||||
*(index_id_str + 8) = '\0';
|
*(index_id_str + 8) = '\0';
|
||||||
int index_id = (int)strtol(index_id_str, NULL, 16);
|
int index_id = (int) strtol(index_id_str, NULL, 16);
|
||||||
|
|
||||||
memcpy(arg_stat_type, hm->uri.ptr + 3 + 9, 4);
|
memcpy(arg_stat_type, hm->uri.ptr + 3 + 9, 4);
|
||||||
*(arg_stat_type + sizeof(arg_stat_type) - 1) = '\0';
|
*(arg_stat_type + sizeof(arg_stat_type) - 1) = '\0';
|
||||||
@@ -368,6 +368,10 @@ void index_info(struct mg_connection *nc) {
|
|||||||
cJSON_AddNumberToObject(idx_json, "timestamp", (double) idx->desc.timestamp);
|
cJSON_AddNumberToObject(idx_json, "timestamp", (double) idx->desc.timestamp);
|
||||||
cJSON_AddItemToArray(arr, idx_json);
|
cJSON_AddItemToArray(arr, idx_json);
|
||||||
|
|
||||||
|
#ifdef SIST_DEBUG_INFO
|
||||||
|
cJSON_AddStringToObject(idx_json, "root", idx->desc.root);
|
||||||
|
#endif
|
||||||
|
|
||||||
cJSON *models = database_get_models(idx->db);
|
cJSON *models = database_get_models(idx->db);
|
||||||
cJSON_AddItemToObject(idx_json, "models", models);
|
cJSON_AddItemToObject(idx_json, "models", models);
|
||||||
}
|
}
|
||||||
@@ -480,7 +484,7 @@ tag_req_t *parse_tag_request(cJSON *json) {
|
|||||||
return req;
|
return req;
|
||||||
}
|
}
|
||||||
|
|
||||||
subreq_ctx_t *elastic_delete_tag(const char* sid, const tag_req_t *req) {
|
subreq_ctx_t *elastic_delete_tag(const char *sid, const tag_req_t *req) {
|
||||||
char *buf = malloc(sizeof(char) * 8192);
|
char *buf = malloc(sizeof(char) * 8192);
|
||||||
snprintf(buf, 8192,
|
snprintf(buf, 8192,
|
||||||
"{"
|
"{"
|
||||||
@@ -500,7 +504,7 @@ subreq_ctx_t *elastic_delete_tag(const char* sid, const tag_req_t *req) {
|
|||||||
return web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
return web_post_async(url, buf, WebCtx.es_insecure_ssl);
|
||||||
}
|
}
|
||||||
|
|
||||||
subreq_ctx_t *elastic_write_tag(const char* sid, const tag_req_t *req) {
|
subreq_ctx_t *elastic_write_tag(const char *sid, const tag_req_t *req) {
|
||||||
char *buf = malloc(sizeof(char) * 8192);
|
char *buf = malloc(sizeof(char) * 8192);
|
||||||
snprintf(buf, 8192,
|
snprintf(buf, 8192,
|
||||||
"{"
|
"{"
|
||||||
|
|||||||
Reference in New Issue
Block a user