Compare commits

...

11 Commits

Author SHA1 Message Date
d58fcbc788
Merge pull request #246 from yatli/mem_cap_dev
scan memory threshold
2022-02-13 13:26:18 -05:00
b483447b1c
Merge pull request #251 from yatli/example_systemd
add systemd integration example
2022-02-13 11:56:16 -05:00
Yatao Li
0d68d5fc7f use --index-incremental 2022-02-14 00:47:01 +08:00
Yatao Li
1813bf505c add systemd integration example 2022-02-13 19:05:13 +08:00
Yatao Li
9a6e7c7c47 reset throttle timer for each work item 2022-02-13 18:43:25 +08:00
Yatao Li
68252b4e80 query page size on tpool creation 2022-02-13 18:43:25 +08:00
Yatao Li
d1f13f2c84 stop scanning gracefully if memory limit target cannot be met 2022-02-13 18:43:25 +08:00
Yatao Li
6075c21a3a do not throttle writer/index thread pools 2022-02-13 18:43:23 +08:00
Yatao Li
f3674ffa02 stop threadpool when the memory limit is too low for any worker thread to proceed 2022-02-13 18:42:54 +08:00
Yatao Li
de187eff1c minor fix 2022-02-13 18:42:54 +08:00
Yatao Li
8e96174e1f scan memory threshold 2022-02-13 18:42:54 +08:00
13 changed files with 207 additions and 5 deletions

7
contrib/systemd/Makefile Normal file
View File

@ -0,0 +1,7 @@
install:
install sist2-update-all.sh /usr/bin/sist2-update-all.sh
install sist2-update-files.sh /usr/bin/sist2-update-files.sh
install sist2-update-nextcloud.sh /usr/bin/sist2-update-nextcloud.sh
install sist2-update.service /etc/systemd/system/sist2-update.service
install sist2-update.timer /etc/systemd/system/sist2-update.timer
systemctl daemon-reload

31
contrib/systemd/README.md Normal file
View File

@ -0,0 +1,31 @@
# Systemd integration example
This example contains my (yatli) personal configuration for sist2 auto-updating.
The following indices are involved in this configuration:
| Index | Path | Description |
|-----------|------------------|--------------------------------------------|
| files | /zpool/files | Main file repository |
| nextcloud | /zpool/nextcloud | Externally synchronized to a cloud account |
The systemd integration achieves automatic sist2 scanning & indexing everyday at 3:00AM.
### Tailoring the configuration for yourself
`sist2-update-all.sh` calls update scripts for each sist2 index. Add or remove
update scripts accordingly to suit your need. Each update script (e.g.
`sist2-update-files.sh`) has important parameters laid down at the beginning so
make sure to edit them to point to your files and index locations.
### Installation
```bash
# install the services and scripts
sudo make install
# enable & start the timer
sudo systemctl enable sist2-update.timer
sudo systemctl start sist2-update.timer
# verify that the timer has been enabled
systemctl list-timers --all
```

View File

@ -0,0 +1,9 @@
#!/bin/bash
set -e
__dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Update index: Files"
source ${__dir}/sist2-update-files.sh
echo "Update index: Nextcloud"
source ${__dir}/sist2-update-nextcloud.sh
echo "Done. Restarting sist2."
docker restart sist2-sist2-1

View File

@ -0,0 +1,34 @@
#!/bin/bash
set -e
DATE=$(date +%Y_%m_%d)
CONTENT=/zpool/files
ORIG=/mnt/ssd/sist-index/files.idx
NEW=/mnt/ssd/sist-index/files_$DATE.idx
EXCLUDE='ZArchives|TorrentStore|TorrentDownload|624f0c59-1fef-44f6-95e9-7483296f2833|ubuntu-full-2021-12-07'
NAME=Files
#REWRITE_URL="http://localhost:33333/activate?collection=$NAME&path="
REWRITE_URL=""
sist2 scan \
--threads 14 \
--mem-throttle 32768 \
--quality 1.0 \
--name $NAME \
--ocr-lang=eng+chi_sim \
--ocr-ebooks \
--ocr-images \
--exclude=$EXCLUDE \
--rewrite-url=$REWRITE_URL \
--incremental=$ORIG \
--output=$NEW \
$CONTENT
echo ">>> Scan complete"
rm -rf $ORIG
mv $NEW $ORIG
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
sist2 index $ORIG --incremental-index
echo ">>> Index complete"

View File

@ -0,0 +1,33 @@
#!/bin/bash
set -e
DATE=$(date +%Y_%m_%d)
CONTENT=/zpool/nextcloud/v-yadli
ORIG=/mnt/ssd/sist-index/nextcloud.idx
NEW=/mnt/ssd/sist-index/nextcloud_$DATE.idx
EXCLUDE='Yatao|.*263418493\\/Image\\/.*'
NAME=NextCloud
# REWRITE_URL="http://localhost:33333/activate?collection=$NAME&path="
REWRITE_URL=""
sist2 scan \
--threads 14 \
--mem-throttle 32768 \
--quality 1.0 \
--name $NAME \
--ocr-lang=eng+chi_sim \
--ocr-ebooks \
--ocr-images \
--exclude=$EXCLUDE \
--rewrite-url=$REWRITE_URL \
--incremental=$ORIG \
--output=$NEW \
$CONTENT
echo ">>> Scan complete"
rm -rf $ORIG
mv $NEW $ORIG
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
sist2 index $ORIG --incremental-index

View File

@ -0,0 +1,6 @@
[Unit]
Description=sist2-update
[Service]
User=yatli
ExecStart=/bin/bash /usr/bin/sist2-update-all.sh

View File

@ -0,0 +1,10 @@
[Unit]
Description=sist2-update
[Timer]
OnCalendar=*-*-* 3:00:00
Persistent=true
Unit=sist2-update.service
[Install]
WantedBy=timers.target

View File

@ -19,6 +19,8 @@
#define DEFAULT_MAX_MEM_BUFFER 2000
#define DEFAULT_THROTTLE_MEMORY_THRESHOLD 0
const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
@ -255,6 +257,10 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
if (args->scan_mem_limit <= 0) {
args->scan_mem_limit = DEFAULT_THROTTLE_MEMORY_THRESHOLD;
}
if (args->list_path != NULL) {
if (strcmp(args->list_path, "-") == 0) {
args->list_file = stdin;

View File

@ -10,6 +10,7 @@ typedef struct scan_args {
int size;
int content_size;
int threads;
int scan_mem_limit;
char *incremental;
char *output;
char *rewrite_url;

View File

@ -35,6 +35,7 @@ typedef struct {
int threads;
int depth;
int calculate_checksums;
size_t mem_limit;
size_t stat_tn_size;
size_t stat_index_size;

View File

@ -253,6 +253,7 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.threads = args->threads;
ScanCtx.depth = args->depth;
ScanCtx.mem_limit = args->scan_mem_limit * 1024 * 1024;
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
@ -382,10 +383,10 @@ void sist2_scan(scan_args_t *args) {
load_incremental_index(args);
}
ScanCtx.pool = tpool_create(args->threads, thread_cleanup, TRUE, TRUE);
ScanCtx.pool = tpool_create(ScanCtx.threads, thread_cleanup, TRUE, TRUE, ScanCtx.mem_limit);
tpool_start(ScanCtx.pool);
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE, 0);
tpool_start(ScanCtx.writer_pool);
if (args->list_path) {
@ -466,7 +467,7 @@ void sist2_index(index_args_t *args) {
f = index_json;
}
IndexCtx.pool = tpool_create(args->threads, elastic_cleanup, FALSE, args->print == 0);
IndexCtx.pool = tpool_create(args->threads, elastic_cleanup, FALSE, args->print == 0, 0);
tpool_start(IndexCtx.pool);
READ_INDICES(file_path, args->index_path, {
@ -586,6 +587,7 @@ int main(int argc, const char *argv[]) {
OPT_GROUP("Scan options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
OPT_STRING(0, "mem-throttle", &scan_args->scan_mem_limit, "Total memory threshold in MB for scan throttling. DEFAULT=0"),
OPT_FLOAT('q', "quality", &scan_args->quality,
"Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3"),
OPT_INTEGER(0, "size", &scan_args->size,

View File

@ -28,6 +28,9 @@ typedef struct tpool {
int work_cnt;
int done_cnt;
int busy_cnt;
int throttle_stuck_cnt;
size_t mem_limit;
size_t page_size;
int free_arg;
int stop;
@ -114,11 +117,42 @@ int tpool_add_work(tpool_t *pool, thread_func_t func, void *arg) {
return 1;
}
/**
* see: https://github.com/htop-dev/htop/blob/f782f821f7f8081cb43bbad1c37f32830a260a81/linux/LinuxProcessList.c
*/
__always_inline
static size_t _get_total_mem(tpool_t* pool) {
FILE* statmfile = fopen("/proc/self/statm", "r");
if (!statmfile)
return 0;
long int dummy, dummy2, dummy3, dummy4, dummy5, dummy6;
long int m_resident;
int r = fscanf(statmfile, "%ld %ld %ld %ld %ld %ld %ld",
&dummy, /* m_virt */
&m_resident,
&dummy2, /* m_share */
&dummy3, /* m_trs */
&dummy4, /* unused since Linux 2.6; always 0 */
&dummy5, /* m_drs */
&dummy6); /* unused since Linux 2.6; always 0 */
fclose(statmfile);
if (r == 7) {
return m_resident * pool->page_size;
} else {
return 0;
}
}
/**
* Thread worker function
*/
static void *tpool_worker(void *arg) {
tpool_t *pool = arg;
int stuck_notified = 0;
int throttle_ms = 0;
while (1) {
pthread_mutex_lock(&pool->work_mutex);
@ -138,10 +172,35 @@ static void *tpool_worker(void *arg) {
pthread_mutex_unlock(&(pool->work_mutex));
if (work != NULL) {
stuck_notified = 0;
throttle_ms = 0;
while(!pool->stop && pool->mem_limit > 0 && _get_total_mem(pool) >= pool->mem_limit) {
if (!stuck_notified && throttle_ms >= 90000) {
// notify the pool that this thread is stuck.
pthread_mutex_lock(&(pool->work_mutex));
pool->throttle_stuck_cnt += 1;
if (pool->throttle_stuck_cnt == pool->thread_cnt) {
LOG_ERROR("tpool.c", "Throttle memory limit too low, cannot proceed!");
pool->stop = TRUE;
}
pthread_mutex_unlock(&(pool->work_mutex));
stuck_notified = 1;
}
usleep(10000);
throttle_ms += 10;
}
if (pool->stop) {
break;
}
// we are not stuck anymore. cancel our notification.
if (stuck_notified) {
pthread_mutex_lock(&(pool->work_mutex));
pool->throttle_stuck_cnt -= 1;
pthread_mutex_unlock(&(pool->work_mutex));
}
work->func(work->arg);
if (pool->free_arg) {
free(work->arg);
@ -243,18 +302,21 @@ void tpool_destroy(tpool_t *pool) {
* Create a thread pool
* @param thread_cnt Worker threads count
*/
tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int free_arg, int print_progress) {
tpool_t *tpool_create(int thread_cnt, void cleanup_func(), int free_arg, int print_progress, size_t mem_limit) {
tpool_t *pool = malloc(sizeof(tpool_t));
pool->thread_cnt = thread_cnt;
pool->work_cnt = 0;
pool->done_cnt = 0;
pool->busy_cnt = 0;
pool->throttle_stuck_cnt = 0;
pool->mem_limit = mem_limit;
pool->stop = FALSE;
pool->free_arg = free_arg;
pool->cleanup_func = cleanup_func;
pool->threads = calloc(sizeof(pthread_t), thread_cnt);
pool->print_progress = print_progress;
pool->page_size = getpagesize();
pthread_mutex_init(&(pool->work_mutex), NULL);

View File

@ -8,7 +8,7 @@ typedef struct tpool tpool_t;
typedef void (*thread_func_t)(void *arg);
tpool_t *tpool_create(int num, void (*cleanup_func)(), int free_arg, int print_progress);
tpool_t *tpool_create(int num, void (*cleanup_func)(), int free_arg, int print_progress, size_t mem_limit);
void tpool_start(tpool_t *pool);
void tpool_destroy(tpool_t *pool);