mirror of
https://github.com/simon987/sist2.git
synced 2025-04-10 14:06:45 +00:00
Disable OSD, add preserve_interword_spaces for chi_sim OCR (#443)
This commit is contained in:
parent
af5059f366
commit
2936240df8
13
third-party/libscan/libscan/ocr/ocr.h
vendored
13
third-party/libscan/libscan/ocr/ocr.h
vendored
@@ -5,7 +5,7 @@
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
#define MIN_OCR_WIDTH 350
|
||||
-#define MIN_OCR_HEIGHT 100
|
||||
+#define MIN_OCR_HEIGHT 33
|
||||
#define MIN_OCR_LEN 10
|
||||
|
||||
#define OCR_IS_VALID_BPP(d) \
|
||||
@@ -28,7 +28,13 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
|
||||
|
||||
-TessBaseAPISetPageSegMode(api, PSM_AUTO_OSD);
|
||||
+// https://github.com/simon987/sist2/issues/443
|
||||
+if (strstr(tesseract_lang, "chi") != NULL) {
|
||||
+TessBaseAPISetVariable(api, "preserve_interword_spaces", "1");
|
||||
+}
|
||||
|
||||
+// TODO: add this as param?
|
||||
+// TessBaseAPISetPageSegMode(api, PSM_AUTO_OSD);
|
||||
|
||||
TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
|
||||
TessBaseAPISetSourceResolution(api, img_xres);
|
||||
@@ -39,6 +45,9 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
cb(text, len);
|
||||
}
|
||||
|
||||
+fprintf(stderr, "OCR: '%s'\n", text);
|
||||
|
||||
TessDeleteText(text);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user