Mirror of https://github.com/simon987/sist2.git (synced 2025-12-11 14:38:54 +00:00)
Update --ocr-* args, enable OCR'ing images
This commit is contained in:
16
src/cli.c
16
src/cli.c
@@ -168,7 +168,23 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->ocr_images && args->tesseract_lang == NULL) {
|
||||
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
|
||||
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->tesseract_lang != NULL) {
|
||||
|
||||
if (!args->ocr_ebooks && !args->ocr_images) {
|
||||
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
|
||||
return 1;
|
||||
}
|
||||
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
|
||||
const char* trained_data_path;
|
||||
|
||||
@@ -21,6 +21,8 @@ typedef struct scan_args {
|
||||
char *archive_passphrase;
|
||||
char *tesseract_lang;
|
||||
const char *tesseract_path;
|
||||
int ocr_images;
|
||||
int ocr_ebooks;
|
||||
char *exclude_regex;
|
||||
int fast;
|
||||
const char* treemap_threshold_str;
|
||||
|
||||
@@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
|
||||
return "parent";
|
||||
case MetaExifMake:
|
||||
return "exif_make";
|
||||
case MetaExifDescription:
|
||||
return "exif_description";
|
||||
case MetaExifSoftware:
|
||||
return "exif_software";
|
||||
case MetaExifExposureTime:
|
||||
@@ -150,6 +152,7 @@ char *build_json_string(document_t *doc) {
|
||||
case MetaFontName:
|
||||
case MetaParent:
|
||||
case MetaExifMake:
|
||||
case MetaExifDescription:
|
||||
case MetaExifSoftware:
|
||||
case MetaExifExposureTime:
|
||||
case MetaExifFNumber:
|
||||
|
||||
14
src/main.c
14
src/main.c
@@ -220,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.media_ctx.store = _store;
|
||||
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
|
||||
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
|
||||
|
||||
if (args->ocr_images) {
|
||||
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
|
||||
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
|
||||
}
|
||||
init_media();
|
||||
|
||||
// OOXML
|
||||
@@ -501,7 +506,7 @@ void sist2_web(web_args_t *args) {
|
||||
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
|
||||
WebCtx.tagline = args->tagline;
|
||||
WebCtx.dev = args->dev;
|
||||
strcpy(WebCtx.lang, args->lang);
|
||||
strcpy(WebCtx.lang, args->lang);
|
||||
|
||||
for (int i = 0; i < args->index_count; i++) {
|
||||
char *abs_path = abspath(args->indices[i]);
|
||||
@@ -576,8 +581,11 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
|
||||
"Passphrase for encrypted archive files"),
|
||||
|
||||
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
|
||||
"which are installed on your machine)"),
|
||||
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
|
||||
"Tesseract language (use 'tesseract --list-langs' to see "
|
||||
"which are installed on your machine)"),
|
||||
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
|
||||
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
|
||||
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
|
||||
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
|
||||
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
|
||||
|
||||
10
src/web/static_generated.c
vendored
10
src/web/static_generated.c
vendored
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user