mirror of
https://github.com/simon987/sist2.git
synced 2025-04-08 13:06:47 +00:00
Update --ocr-* args, enable OCR'ing images
This commit is contained in:
parent
b37e5a4ad4
commit
ad95684771
11
README.md
11
README.md
@ -102,8 +102,9 @@ scan is also supported.
|
||||
|
||||
### OCR
|
||||
|
||||
You can enable OCR support for pdf,xps,fb2,epub file types with the
|
||||
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
|
||||
You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with the
|
||||
`--ocr-lang <lang>` option in combination with `--ocr-images` and/or `--ocr-ebooks`.
|
||||
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
|
||||
directly [from GitHub](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
|
||||
|
||||
The `simon987/sist2` image comes with common languages
|
||||
@ -116,9 +117,9 @@ name must be identical to the `*.traineddata` file installed on your system
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
sist2 scan --ocr jpn ~/Books/Manga/
|
||||
sist2 scan --ocr eng ~/Books/Textbooks/
|
||||
sist2 scan --ocr eng+chi_sim ~/Books/Chinese-Bilingual/
|
||||
sist2 scan --ocr-ebooks --ocr-lang jpn ~/Books/Manga/
|
||||
sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
|
||||
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
|
||||
```
|
||||
|
||||
## Build from source
|
||||
|
@ -43,7 +43,7 @@ Scan options
|
||||
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
|
||||
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
|
||||
--archive-passphrase=<str> Passphrase for encrypted archive files
|
||||
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
|
||||
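--ocr-lang=<str> Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
--ocr-images Enable OCR'ing of image files.
--ocr-ebooks Enable OCR'ing of ebook files.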
|
||||
-e, --exclude=<str> Files that match this regex will not be scanned
|
||||
--fast Only index file names & mime type
|
||||
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
|
||||
|
16
src/cli.c
16
src/cli.c
@ -168,7 +168,23 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->ocr_images && args->tesseract_lang == NULL) {
|
||||
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
|
||||
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (args->tesseract_lang != NULL) {
|
||||
|
||||
if (!args->ocr_ebooks && !args->ocr_images) {
|
||||
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
|
||||
return 1;
|
||||
}
|
||||
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
|
||||
const char* trained_data_path;
|
||||
|
@ -21,6 +21,8 @@ typedef struct scan_args {
|
||||
char *archive_passphrase;
|
||||
char *tesseract_lang;
|
||||
const char *tesseract_path;
|
||||
int ocr_images;
|
||||
int ocr_ebooks;
|
||||
char *exclude_regex;
|
||||
int fast;
|
||||
const char* treemap_threshold_str;
|
||||
|
@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
|
||||
return "parent";
|
||||
case MetaExifMake:
|
||||
return "exif_make";
|
||||
case MetaExifDescription:
|
||||
return "exif_description";
|
||||
case MetaExifSoftware:
|
||||
return "exif_software";
|
||||
case MetaExifExposureTime:
|
||||
@ -150,6 +152,7 @@ char *build_json_string(document_t *doc) {
|
||||
case MetaFontName:
|
||||
case MetaParent:
|
||||
case MetaExifMake:
|
||||
case MetaExifDescription:
|
||||
case MetaExifSoftware:
|
||||
case MetaExifExposureTime:
|
||||
case MetaExifFNumber:
|
||||
|
14
src/main.c
14
src/main.c
@ -220,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
|
||||
ScanCtx.media_ctx.store = _store;
|
||||
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
|
||||
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
|
||||
|
||||
if (args->ocr_images) {
|
||||
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
|
||||
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
|
||||
}
|
||||
init_media();
|
||||
|
||||
// OOXML
|
||||
@ -501,7 +506,7 @@ void sist2_web(web_args_t *args) {
|
||||
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
|
||||
WebCtx.tagline = args->tagline;
|
||||
WebCtx.dev = args->dev;
|
||||
strcpy(WebCtx.lang, args->lang);
|
||||
strcpy(WebCtx.lang, args->lang);
|
||||
|
||||
for (int i = 0; i < args->index_count; i++) {
|
||||
char *abs_path = abspath(args->indices[i]);
|
||||
@ -576,8 +581,11 @@ int main(int argc, const char *argv[]) {
|
||||
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
|
||||
"Passphrase for encrypted archive files"),
|
||||
|
||||
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
|
||||
"which are installed on your machine)"),
|
||||
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
|
||||
"Tesseract language (use 'tesseract --list-langs' to see "
|
||||
"which are installed on your machine)"),
|
||||
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
|
||||
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
|
||||
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
|
||||
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
|
||||
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
|
||||
|
10
src/web/static_generated.c
vendored
10
src/web/static_generated.c
vendored
File diff suppressed because one or more lines are too long
2
third-party/libscan/libscan/ebook/ebook.c
vendored
2
third-party/libscan/libscan/ebook/ebook.c
vendored
@ -233,7 +233,7 @@ void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
|
||||
|
||||
int l2factor = 0;
|
||||
|
||||
if (img->w >= MIN_OCR_SIZE && img->h >= MIN_OCR_SIZE && OCR_IS_VALID_BPP(img->n)) {
|
||||
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
|
||||
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
|
||||
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);
|
||||
fz_drop_pixmap(fzctx, pix);
|
||||
|
66
third-party/libscan/libscan/media/media.c
vendored
66
third-party/libscan/libscan/media/media.c
vendored
@ -1,12 +1,18 @@
|
||||
#include "media.h"
|
||||
#include "../ocr/ocr.h"
|
||||
#include <ctype.h>
|
||||
|
||||
#define MIN_SIZE 32
|
||||
#define AVIO_BUF_SIZE 8192
|
||||
#define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)
|
||||
|
||||
#define STREAM_IS_IMAGE (stream->nb_frames <= 1)
|
||||
|
||||
#define STORE_AS_IS ((void*)-1)
|
||||
|
||||
// Pointer to document being processed
|
||||
__thread document_t *thread_doc;
|
||||
|
||||
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
|
||||
|
||||
int has_extension = doc->ext > doc->base;
|
||||
@ -311,7 +317,7 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
|
||||
if (strcmp(key, "artist") == 0) {
|
||||
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
|
||||
} else if (strcmp(key, "imagedescription") == 0) {
|
||||
APPEND_TAG_META(MetaContent)
|
||||
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
|
||||
} else if (strcmp(key, "make") == 0) {
|
||||
APPEND_TAG_META(MetaExifMake)
|
||||
} else if (strcmp(key, "model") == 0) {
|
||||
@ -343,6 +349,55 @@ append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *f
|
||||
}
|
||||
}
|
||||
|
||||
static void ocr_image_cb(const char *text, size_t len) {
|
||||
APPEND_STR_META(thread_doc, MetaContent, text);
|
||||
}
|
||||
|
||||
#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32
|
||||
#define OCR_BYTES_PER_PIXEL 4
|
||||
#define OCR_PIXELS_PER_INCH 70
|
||||
|
||||
/**
 * Run OCR on a decoded image frame.
 *
 * The frame is converted with libswscale to OCR_PIXEL_FORMAT
 * (OCR_BYTES_PER_PIXEL bytes per pixel) at its original dimensions and then
 * handed to ocr_extract_text(), which reports recognised text through
 * ocr_image_cb().
 *
 * @param ctx     media scan context; supplies tesseract_path and tesseract_lang
 * @param doc     document being processed; published through thread_doc so the
 *                OCR callback can reach it
 * @param decoder decoder context whose pix_fmt is used as the source pixel format
 * @param frame   decoded frame to convert and OCR
 *
 * If any allocation or conversion step fails, OCR is skipped for this frame
 * and all intermediate resources are released.
 */
void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {

    struct SwsContext *sws_ctx = NULL;
    uint8_t *dst_buf = NULL;
    int dst_buf_len;

    // Destination frame that will hold the converted pixels
    AVFrame *rgb_frame = av_frame_alloc();
    if (rgb_frame == NULL) {
        goto cleanup;
    }

    // Same dimensions in and out: only the pixel format changes
    sws_ctx = sws_getContext(
            frame->width, frame->height, decoder->pix_fmt,
            frame->width, frame->height, OCR_PIXEL_FORMAT,
            SWS_LANCZOS, 0, 0, 0
    );
    if (sws_ctx == NULL) {
        goto cleanup;
    }

    // A negative value is an error code and must not be used as a size
    dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
    if (dst_buf_len <= 0) {
        goto cleanup;
    }

    // NOTE(review): the buffer is allocated at twice the computed size, as in
    // the original code; presumably headroom for the scaler - confirm.
    // The multiplication is done in size_t so it cannot overflow int.
    dst_buf = (uint8_t *) av_malloc((size_t) dst_buf_len * 2);
    if (dst_buf == NULL) {
        goto cleanup;
    }

    if (av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf, OCR_PIXEL_FORMAT,
                             frame->width, frame->height, 1) < 0) {
        goto cleanup;
    }

    if (sws_scale(sws_ctx,
                  (const uint8_t *const *) frame->data, frame->linesize,
                  0, frame->height,
                  rgb_frame->data, rgb_frame->linesize) <= 0) {
        // Nothing was converted, so there is nothing meaningful to OCR
        goto cleanup;
    }

    // ocr_image_cb() has no user-data argument; it finds the document through thread_doc
    thread_doc = doc;
    ocr_extract_text(
            ctx->tesseract_path,
            ctx->tesseract_lang,
            rgb_frame->data[0],
            frame->width,
            frame->height,
            OCR_BYTES_PER_PIXEL,
            rgb_frame->linesize[0],
            OCR_PIXELS_PER_INCH,
            ocr_image_cb
    );

cleanup:
    // All three release functions accept NULL, so one exit path is enough
    sws_freeContext(sws_ctx);
    av_free(dst_buf);
    av_frame_free(&rgb_frame);
}
|
||||
|
||||
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
|
||||
|
||||
int video_stream = -1;
|
||||
@ -419,11 +474,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
|
||||
avcodec_open2(decoder, video_codec, NULL);
|
||||
|
||||
//Seek
|
||||
if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
|
||||
if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
|
||||
int seek_ret;
|
||||
for (int i = 20; i >= 0; i--) {
|
||||
seek_ret = av_seek_frame(pFormatCtx, video_stream,
|
||||
stream->duration * 0.10, 0);
|
||||
(long) ((double) stream->duration * 0.10), 0);
|
||||
if (seek_ret == 0) {
|
||||
break;
|
||||
}
|
||||
@ -438,6 +493,11 @@ void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx,
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
|
||||
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
|
||||
}
|
||||
|
||||
// NOTE: OCR'd content takes precedence over exif image description
|
||||
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
|
||||
|
||||
// Scale frame
|
||||
|
3
third-party/libscan/libscan/media/media.h
vendored
3
third-party/libscan/libscan/media/media.h
vendored
@ -19,6 +19,9 @@ typedef struct {
|
||||
float tn_qscale;
|
||||
long max_media_buffer;
|
||||
int read_subtitles;
|
||||
|
||||
const char *tesseract_lang;
|
||||
const char *tesseract_path;
|
||||
} scan_media_ctx_t;
|
||||
|
||||
__always_inline
|
||||
|
15
third-party/libscan/libscan/ocr/ocr.h
vendored
15
third-party/libscan/libscan/ocr/ocr.h
vendored
@ -4,7 +4,8 @@
|
||||
#include "../scan.h"
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#define MIN_OCR_SIZE 350
|
||||
#define MIN_OCR_WIDTH 350
|
||||
#define MIN_OCR_HEIGHT 100
|
||||
#define MIN_OCR_LEN 10
|
||||
|
||||
#define OCR_IS_VALID_BPP(d) \
|
||||
@ -19,7 +20,7 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
|
||||
const int img_bpp, const int img_stride, const int img_xres,
|
||||
const ocr_extract_callback_t cb) {
|
||||
|
||||
if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
|
||||
if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
|
||||
!OCR_IS_VALID_BPP(img_bpp)) {
|
||||
return;
|
||||
}
|
||||
@ -31,11 +32,13 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
|
||||
TessBaseAPISetSourceResolution(api, img_xres);
|
||||
|
||||
char *text = TessBaseAPIGetUTF8Text(api);
|
||||
size_t len = strlen(text);
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
cb(text, len);
|
||||
if (text != NULL) {
|
||||
size_t len = strlen(text);
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
cb(text, len);
|
||||
}
|
||||
TessDeleteText(text);
|
||||
}
|
||||
TessDeleteText(text);
|
||||
|
||||
TessBaseAPIEnd(api);
|
||||
TessBaseAPIDelete(api);
|
||||
|
1
third-party/libscan/libscan/scan.h
vendored
1
third-party/libscan/libscan/scan.h
vendored
@ -61,6 +61,7 @@ enum metakey {
|
||||
MetaFontName,
|
||||
MetaParent,
|
||||
MetaExifMake,
|
||||
MetaExifDescription,
|
||||
MetaExifSoftware,
|
||||
MetaExifExposureTime,
|
||||
MetaExifFNumber,
|
||||
|
Loading…
x
Reference in New Issue
Block a user