Compare commits

...

6 Commits

Author SHA1 Message Date
Shy
40136b74b4 Update readme 2025-07-05 19:40:20 -04:00
Shy
cb0a587fe9 Fix #504, version bump 2025-07-05 19:01:00 -04:00
Shy
d221e08d67 Fix #534, version bump 2025-07-05 09:56:50 -04:00
Shy
bcab40783c Version bump 2025-07-05 08:51:28 -04:00
Shy
ea23bf01e3 Fixes #536 2025-07-05 08:51:15 -04:00
Shy
f5d070496f Closes #535 2025-07-05 08:17:32 -04:00
18 changed files with 231 additions and 15 deletions

View File

@ -62,7 +62,9 @@ add_executable(
src/database/database_schema.c
src/database/database_fts.c
src/web/web_fts.c
src/database/database_embeddings.c)
src/database/database_embeddings.c
src/ignorelist.c
src/ignorelist.h)
set_target_properties(sist2 PROPERTIES LINKER_LANGUAGE C)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
@ -76,6 +78,7 @@ find_package(CURL CONFIG REQUIRED)
find_library(MAGIC_LIB NAMES libmagic.a REQUIRED)
find_package(unofficial-sqlite3 CONFIG REQUIRED)
find_package(OpenBLAS CONFIG REQUIRED)
find_package(libgit2 CONFIG REQUIRED)
target_include_directories(
@ -149,6 +152,7 @@ target_link_libraries(
# m
z
libgit2::libgit2package
argparse
unofficial::mongoose::mongoose
CURL::libcurl

View File

@ -216,7 +216,7 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
3. Install vcpkg dependencies
```bash
vcpkg install openblas curl[core,openssl] sqlite3[core,fts5,json1] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf[ocr] gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,zlib]
vcpkg install openblas curl[core,openssl] sqlite3[core,fts5,json1] cpp-jwt pcre cjson brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf[ocr] gtest mongoose libmagic libraw gumbo ffmpeg[core,avcodec,avformat,swscale,swresample,webp,opus,mp3lame,vpx,zlib] libgit2[core,pcre]
```
4. Build
@ -224,6 +224,6 @@ docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
git clone --recursive https://github.com/sist2app/sist2/
(cd sist2-vue; npm install; npm run build)
(cd sist2-admin/frontend; npm install; npm run build)
cmake -DSIST_DEBUG=off -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
cmake -DSIST_DEBUG=off -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=<VCPKG_ROOT>/scripts/buildsystems/vcpkg.cmake .
make
```

View File

@ -108,6 +108,27 @@ sist scan ~/Documents -o ./documents.sist2 --incremental
sist scan ~/Documents -o ./documents.sist2 --incremental
```
### Excluding files
You can use the `--exclude` option to specify exclude patterns. For more complex setups, you can create a
`.sist2ignore` file at the root of the scan path (For example, `~/Documents/.sist2ignore` for the example above).
The syntax for sist2ignore is the same as .gitignore for Git (reference [here](https://git-scm.com/docs/gitignore)).
Example:
**.sist2ignore**
```gitignore
# Ignore all PDF files
*.pdf
# But don't ignore them for the /important_files/ directory
!/important_files/*.pdf
# Ignore all files in _staging/ directories
_staging/
```
### Index documents to Elasticsearch search backend
```bash

View File

@ -93,7 +93,9 @@ export default {
threads: "Number of threads",
batchSize: "Index batch size",
script: "User script",
searchIndex: "Search index file location"
searchIndex: "Search index file location",
esMappings: "Elasticsearch mappings file override",
esSettings: "Elasticsearch settings file override"
},
scanOptions: {
title: "Scanning options",

View File

@ -44,6 +44,12 @@
<label>{{ $t("backendOptions.batchSize") }}</label>
<b-form-input v-model="backend.batch_size" type="number" min="1" @change="update()"></b-form-input>
<label>{{ $t("backendOptions.esMappings") }}</label>
<b-form-textarea v-model="backend.es_mappings" rows="4" @change="update()"></b-form-textarea>
<label>{{ $t("backendOptions.esSettings") }}</label>
<b-form-textarea v-model="backend.es_settings" rows="4" @change="update()"></b-form-textarea>
</template>
<template v-else>
<label>{{ $t("backendOptions.searchIndex") }}</label>

View File

@ -10,7 +10,7 @@ from logging import FileHandler, StreamHandler
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
from threading import Thread
from typing import List
from typing import List, Optional
from pydantic import BaseModel
@ -40,6 +40,8 @@ class Sist2SearchBackend(BaseModel):
es_url: str = "http://elasticsearch:9200"
es_insecure_ssl: bool = False
es_mappings: Optional[str] = None
es_settings: Optional[str] = None
es_index: str = "sist2"
threads: int = 1
batch_size: int = 70
@ -57,6 +59,8 @@ class IndexOptions(BaseModel):
path: str = None
incremental_index: bool = True
search_backend: str = None
es_mappings_file: Optional[str] = None
es_settings_file: Optional[str] = None
def __init__(self, **kwargs):
super().__init__(**kwargs)
@ -75,6 +79,12 @@ class IndexOptions(BaseModel):
if search_backend.es_insecure_ssl:
args.append(f"--es-insecure-ssl")
if self.es_mappings_file:
args.append(f"--mappings-file={self.es_mappings_file}")
if self.es_settings_file:
args.append(f"--settings-file={self.es_settings_file}")
if self.incremental_index:
args.append(f"--incremental-index")
@ -249,6 +259,20 @@ class Sist2:
def index(self, options: IndexOptions, search_backend: Sist2SearchBackend, logs_cb, set_pid_cb):
if search_backend.es_mappings:
with NamedTemporaryFile("w", prefix="sist2-admin", suffix=".txt", delete=False) as f:
f.write(search_backend.es_mappings)
options.es_mappings_file = f.name
else:
options.es_mappings_file = None
if search_backend.es_settings:
with NamedTemporaryFile("w", prefix="sist2-admin", suffix=".txt", delete=False) as f:
f.write(search_backend.es_settings)
options.es_settings_file = f.name
else:
options.es_settings_file = None
args = [
self.bin_path,
*options.args(search_backend),

View File

@ -69,7 +69,8 @@ class Sist2Api {
hit._props.isImage = true;
}
if ("width" in hit._source && !hit._props.isSubDocument && hit._source.videoc !== "tiff"
&& hit._source.videoc !== "raw" && hit._source.videoc !== "ppm") {
&& hit._source.videoc !== "raw" && hit._source.videoc !== "ppm"
&& hit._source.mime !== "image/jp2") {
hit._props.isPlayableImage = true;
}
if ("width" in hit._source && "height" in hit._source) {

View File

@ -238,7 +238,7 @@ class Sist2ElasticsearchQuery {
pre_tags: ["<mark>"],
post_tags: ["</mark>"],
fragment_size: getters.optFragmentSize,
number_of_fragments: 1,
number_of_fragments: getters.optFragmentCount,
order: "score",
fields: {
content: {},

View File

@ -3,6 +3,8 @@
</template>
<script>
const FRAGMENT_SEPARATOR = "<br /><i style='line-height: 2.4'>[…]</i><br/>";
export default {
name: "ContentDiv",
props: ["doc"],
@ -13,10 +15,10 @@ export default {
}
if (this.doc.highlight["content.nGram"]) {
return this.doc.highlight["content.nGram"][0];
return this.doc.highlight["content.nGram"].join(FRAGMENT_SEPARATOR);
}
if (this.doc.highlight.content) {
return this.doc.highlight.content[0];
return this.doc.highlight.content.join(FRAGMENT_SEPARATOR);
}
}
}

View File

@ -59,6 +59,7 @@ export default {
searchInPath: "Enable matching query against document path",
suggestPath: "Enable auto-complete in path filter bar",
fragmentSize: "Highlight context size",
fragmentCount: "Number of highlight snippets",
queryMode: "Search mode",
displayMode: "Display",
columns: "Column count",
@ -242,6 +243,7 @@ export default {
searchInPath: "Abgleich der Abfrage mit dem Dokumentpfad aktivieren",
suggestPath: "Aktiviere Auto-Vervollständigung in Pfadfilter-Leiste",
fragmentSize: "Kontextgröße",
fragmentCount: "Anzahl der hervorgehobenen Snippets",
queryMode: "Such-Modus",
displayMode: "Ansicht",
columns: "Anzahl Spalten",
@ -417,6 +419,7 @@ export default {
searchInPath: "Activer la recherche dans le chemin des documents",
suggestPath: "Activer l'autocomplétion dans la barre de filtre de chemin",
fragmentSize: "Longueur du contexte de surlignage",
fragmentCount: "Nombre d'extraits surlignés",
queryMode: "Mode de recherche",
displayMode: "Affichage",
columns: "Nombre de colonnes",
@ -592,6 +595,7 @@ export default {
searchInPath: "匹配文档路径",
suggestPath: "搜索框启用自动补全",
fragmentSize: "高亮上下文大小",
fragmentCount: "突出显示的项目数",
queryMode: "搜索模式",
displayMode: "显示",
columns: "列数",
@ -767,6 +771,7 @@ export default {
searchInPath: "Włącz szukanie również w ścieżce dokumentu",
suggestPath: "Włącz auto-uzupełnianie w filtrze ścieżek",
fragmentSize: "Podświetl wielkość kontekstu w znakach",
fragmentCount: "Liczba wyróżnionych fragmentów",
queryMode: "Tryb szukania",
displayMode: "Wyświetlanie",
columns: "Liczba kolumn",

View File

@ -3,7 +3,7 @@ import Vuex from "vuex"
import {deserializeMimes, randomSeed, serializeMimes} from "@/util";
import {getInstance} from "@/plugins/auth0.js";
const CONF_VERSION = 3;
const CONF_VERSION = 4;
Vue.use(Vuex);
@ -41,6 +41,7 @@ export default new Vuex.Store({
optTagOrOperator: false,
optFuzzy: true,
optFragmentSize: 200,
optFragmentCount: 1,
optQueryMode: "simple",
optSearchInPath: false,
optColumns: "auto",
@ -170,6 +171,7 @@ export default new Vuex.Store({
setOptSearchInPath: (state, val) => state.optSearchInPath = val,
setOptSuggestPath: (state, val) => state.optSuggestPath = val,
setOptFragmentSize: (state, val) => state.optFragmentSize = val,
setOptFragmentCount: (state, val) => state.optFragmentCount = val,
setOptQueryMode: (state, val) => state.optQueryMode = val,
setOptResultSize: (state, val) => state.optSize = val,
setOptTagOrOperator: (state, val) => state.optTagOrOperator = val,
@ -430,6 +432,7 @@ export default new Vuex.Store({
optSearchInPath: state => state.optSearchInPath,
optSuggestPath: state => state.optSuggestPath,
optFragmentSize: state => state.optFragmentSize,
optFragmentCount: state => state.optFragmentCount,
optQueryMode: state => state.optQueryMode,
optTreemapType: state => state.optTreemapType,
optTreemapTiling: state => state.optTreemapTiling,

View File

@ -151,6 +151,10 @@
<b-form-input :value="optFragmentSize" step="10" type="number" min="0"
@input="setOptFragmentSize"></b-form-input>
<label :class="{'text-muted': uiSqliteMode}">{{ $t("opt.fragmentCount") }}</label>
<b-form-input :value="optFragmentCount" :disabled="uiSqliteMode" step="1" type="number" min="1"
@input="setOptFragmentCount"></b-form-input>
<label>{{ $t("opt.resultSize") }}</label>
<b-form-input :value="optResultSize" type="number" min="10"
@input="setOptResultSize"></b-form-input>
@ -314,6 +318,7 @@ export default {
"optSearchInPath",
"optSuggestPath",
"optFragmentSize",
"optFragmentCount",
"optQueryMode",
"optTreemapType",
"optTreemapTiling",
@ -360,6 +365,7 @@ export default {
"setOptSearchInPath",
"setOptSuggestPath",
"setOptFragmentSize",
"setOptFragmentCount",
"setOptQueryMode",
"setOptTreemapType",
"setOptTreemapTiling",

View File

@ -19,6 +19,7 @@
#include "src/database/database.h"
#include "src/index/elastic.h"
#include "sqlite3.h"
#include "ignorelist.h"
#include <pcre.h>
@ -34,6 +35,7 @@ typedef struct {
pcre *exclude;
pcre_extra *exclude_extra;
int fast;
ignorelist_t *ignorelist;
scan_arc_ctx_t arc_ctx;
scan_comic_ctx_t comic_ctx;

106
src/ignorelist.c Normal file
View File

@ -0,0 +1,106 @@
#include "ignorelist.h"
#include "ctx.h"
#include <git2.h>
typedef struct ignorelist {
git_repository *repo;
char repo_path[PATH_MAX];
int has_rules;
} ignorelist_t;
char *get_tempdir() {
char *tempdir_env = getenv("TMPDIR");
if (tempdir_env != NULL) {
return tempdir_env;
}
return "/tmp/";
}
void ignorelist_destroy(ignorelist_t* ignorelist) {
git_libgit2_shutdown();
if (ignorelist->repo != NULL) {
git_repository_free(ignorelist->repo);
}
free(ignorelist);
}
ignorelist_t *ignorelist_create() {
git_libgit2_init();
ignorelist_t *ignorelist = malloc(sizeof(ignorelist_t));
ignorelist->repo = NULL;
ignorelist->has_rules = FALSE;
char *tempdir = get_tempdir();
if (tempdir[strlen(tempdir) - 1] == '/') {
sprintf(ignorelist->repo_path, "%ssist2-ignorelist-%d", tempdir, getpid());
} else {
sprintf(ignorelist->repo_path, "%s/sist2-ignorelist-%d", tempdir, getpid());
}
return ignorelist;
}
void ignorelist_load_ignore_file(ignorelist_t *ignorelist, const char *filepath) {
FILE *file;
char line[PATH_MAX * 2];
file = fopen(filepath, "r");
if(file == NULL) {
// No ignore list
return;
}
LOG_DEBUGF("ignorelist.c", "Opening temporary git repository %s", ignorelist->repo_path);
int init_result = git_repository_init(&ignorelist->repo, ignorelist->repo_path, TRUE);
if (init_result != 0) {
LOG_FATALF("ignorelist.c", "Got error code from git_repository_init(): %d", init_result);
}
git_ignore_clear_internal_rules(ignorelist->repo);
while(fgets(line, PATH_MAX * 2, file)){
line[strlen(line) - 1] = '\0'; // Strip trailing newline
char *rules = {line,};
int result = git_ignore_add_rule(ignorelist->repo, rules);
if (result == 0) {
LOG_DEBUGF("ignorelist.c", "Load ignore rule: %s", line);
ignorelist->has_rules = TRUE;
} else {
LOG_FATALF("ignorelist.c", "Invalid ignore rule: %s", line);
}
}
fclose(file);
}
int ignorelist_is_ignored(ignorelist_t *ignorelist, const char *filepath) {
if (!ignorelist->has_rules) {
return FALSE;
}
const char *rel_path = filepath + ScanCtx.index.desc.root_len;
int ignored = -1;
int result = git_ignore_path_is_ignored(&ignored, ignorelist->repo, rel_path);
if (result != 0) {
LOG_FATALF("ignorelist.c", "git_ignore_path_is_ignored returned error code: %d", result);
}
return ignored;
}

16
src/ignorelist.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef SIST2_IGNORELIST_H
#define SIST2_IGNORELIST_H
#include "src/sist.h"
typedef struct ignorelist ignorelist_t;
ignorelist_t *ignorelist_create();
void ignorelist_destroy(ignorelist_t* ignorelist);
void ignorelist_load_ignore_file(ignorelist_t* ignorelist, const char* filepath);
int ignorelist_is_ignored(ignorelist_t* ignorelist, const char* filepath);
#endif //SIST2_IGNORELIST_H

View File

@ -23,8 +23,17 @@ int handle_entry(const char *filepath, const struct stat *info, int typeflag, st
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath);
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
} else if (typeflag == FTW_D) {
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (ignorelist_is_ignored(ScanCtx.ignorelist, filepath)) {
LOG_DEBUGF("walk.c", "Ignored: %s", filepath);
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}

View File

@ -11,6 +11,7 @@
#include "web/serve.h"
#include "parsing/mime.h"
#include "parsing/parse.h"
#include "ignorelist.h"
#include <signal.h>
#include <pthread.h>
@ -239,6 +240,13 @@ void sist2_scan(scan_args_t *args) {
LOG_INFOF("main.c", "sist2 v%s", Version);
ScanCtx.ignorelist = ignorelist_create();
char ignore_filepath[PATH_MAX];
sprintf(ignore_filepath, "%s.sist2ignore", args->path);
ignorelist_load_ignore_file(ScanCtx.ignorelist, ignore_filepath);
ScanCtx.pool = tpool_create(ScanCtx.threads, TRUE);
tpool_start(ScanCtx.pool);
@ -268,6 +276,7 @@ void sist2_scan(scan_args_t *args) {
database_generate_stats(db, args->treemap_threshold);
database_close(db, args->optimize_database);
ignorelist_destroy(ScanCtx.ignorelist);
}
void sist2_index(index_args_t *args) {

View File

@ -51,11 +51,11 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "3.4.5"
#define VERSION "3.5.0"
static const char *const Version = VERSION;
static const int VersionMajor = 3;
static const int VersionMinor = 4;
static const int VersionPatch = 5;
static const int VersionMinor = 5;
static const int VersionPatch = 0;
#ifndef SIST_PLATFORM
#define SIST_PLATFORM unknown