mirror of
https://github.com/simon987/sist2.git
synced 2025-04-10 14:06:45 +00:00
Fix tesseract lang validation, update README.md, fix tesseract memory leak
This commit is contained in:
parent
255bc2d689
commit
15ae2190cf
43
README.md
43
README.md
@ -67,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
|
||||
|
||||
## Format support
|
||||
|
||||
File type | Library | Content | Thumbnail | Metadata
|
||||
:---|:---|:---|:---|:---
|
||||
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
|
||||
cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - |
|
||||
`audio/*` | ffmpeg | - | yes | ID3 tags |
|
||||
`video/*` | ffmpeg | - | yes | title, comment, artist |
|
||||
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
|
||||
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
|
||||
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
|
||||
`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - |
|
||||
html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - |
|
||||
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
|
||||
docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title |
|
||||
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
|
||||
mobi, azw, azw3 | libmobi | yes | no | author, title |
|
||||
wpd (WordPerfect) | libwpd | yes | no | *planned* |
|
||||
json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - |
|
||||
| File type | Library | Content | Thumbnail | Metadata |
|
||||
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
|
||||
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
|
||||
| `audio/*` | ffmpeg | - | yes | ID3 tags |
|
||||
| `video/*` | ffmpeg | - | yes | title, comment, artist |
|
||||
| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
|
||||
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
|
||||
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
|
||||
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
|
||||
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
|
||||
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
|
||||
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
|
||||
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
|
||||
| mobi, azw, azw3 | libmobi | yes | no | author, title |
|
||||
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
|
||||
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
|
||||
|
||||
\* *See [Archive files](#archive-files)*
|
||||
|
||||
@ -109,11 +109,16 @@ directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-File
|
||||
The `simon987/sist2` image comes with common languages
|
||||
(hin, jpn, eng, fra, rus, spa) pre-installed.
|
||||
|
||||
Examples
|
||||
You can use the `+` separator to specify multiple languages. The language
|
||||
name must be identical to the `*.traineddata` file installed on your system
|
||||
(use `chi_sim` rather than `chi-sim`).
|
||||
|
||||
Examples:
|
||||
|
||||
```bash
|
||||
sist2 scan --ocr jpn ~/Books/Manga/
|
||||
sist2 scan --ocr eng ~/Books/Textbooks/
|
||||
sist2 scan --ocr eng+chi_sim ~/Books/Chinese-Bilingual/
|
||||
```
|
||||
|
||||
## Build from source
|
||||
@ -126,7 +131,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
|
||||
git clone --recursive https://github.com/simon987/sist2/
|
||||
cd sist2
|
||||
docker build . -f ./Dockerfile -t my-sist2-image
|
||||
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
|
||||
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
|
||||
```
|
||||
|
||||
### On a linux computer
|
||||
|
31
src/cli.c
31
src/cli.c
@ -146,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
if (args->name == NULL) {
|
||||
args->name = g_path_get_basename(args->output);
|
||||
} else {
|
||||
char* tmp = malloc(strlen(args->name) + 1);
|
||||
char *tmp = malloc(strlen(args->name) + 1);
|
||||
strcpy(tmp, args->name);
|
||||
args->name = tmp;
|
||||
}
|
||||
@ -171,14 +171,25 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
if (args->tesseract_lang != NULL) {
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
|
||||
char filename[128];
|
||||
sprintf(filename, "%s.traineddata", args->tesseract_lang);
|
||||
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
|
||||
if (path == NULL) {
|
||||
LOG_FATAL("cli.c", "Could not find tesseract language file!");
|
||||
}
|
||||
const char* trained_data_path;
|
||||
char *lang = malloc(strlen(args->tesseract_lang) + 1);
|
||||
strcpy(lang, args->tesseract_lang);
|
||||
|
||||
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
|
||||
lang = strtok(lang, "+");
|
||||
|
||||
while (lang != NULL) {
|
||||
char filename[128];
|
||||
sprintf(filename, "%s.traineddata", lang);
|
||||
trained_data_path = find_file_in_paths(TESS_DATAPATHS, filename);
|
||||
if (trained_data_path == NULL) {
|
||||
LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
|
||||
}
|
||||
|
||||
lang = strtok(NULL, "+");
|
||||
}
|
||||
free(lang);
|
||||
|
||||
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
|
||||
if (ret != 0) {
|
||||
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
|
||||
return 1;
|
||||
@ -186,7 +197,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
TessBaseAPIEnd(api);
|
||||
TessBaseAPIDelete(api);
|
||||
|
||||
args->tesseract_path = path;
|
||||
args->tesseract_path = trained_data_path;
|
||||
}
|
||||
|
||||
if (args->exclude_regex != NULL) {
|
||||
@ -220,7 +231,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
|
||||
}
|
||||
|
||||
if (args->list_path != NULL) {
|
||||
if(strcmp(args->list_path, "-") == 0) {
|
||||
if (strcmp(args->list_path, "-") == 0) {
|
||||
args->list_file = stdin;
|
||||
LOG_DEBUG("cli.c", "Using stdin as list file")
|
||||
} else {
|
||||
|
31
third-party/libscan/libscan/ocr/ocr.h
vendored
31
third-party/libscan/libscan/ocr/ocr.h
vendored
@ -19,25 +19,26 @@ ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
|
||||
const int img_bpp, const int img_stride, const int img_xres,
|
||||
const ocr_extract_callback_t cb) {
|
||||
|
||||
if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
|
||||
!OCR_IS_VALID_BPP(img_bpp)) {
|
||||
return;
|
||||
}
|
||||
if (img_w < MIN_OCR_SIZE || img_h < MIN_OCR_SIZE || img_xres <= 0 ||
|
||||
!OCR_IS_VALID_BPP(img_bpp)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
|
||||
TessBaseAPI *api = TessBaseAPICreate();
|
||||
TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
|
||||
|
||||
TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
|
||||
TessBaseAPISetSourceResolution(api, img_xres);
|
||||
TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
|
||||
TessBaseAPISetSourceResolution(api, img_xres);
|
||||
|
||||
char *text = TessBaseAPIGetUTF8Text(api);
|
||||
size_t len = strlen(text);
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
cb(text, len);
|
||||
}
|
||||
char *text = TessBaseAPIGetUTF8Text(api);
|
||||
size_t len = strlen(text);
|
||||
if (len >= MIN_OCR_LEN) {
|
||||
cb(text, len);
|
||||
}
|
||||
TessDeleteText(text);
|
||||
|
||||
TessBaseAPIEnd(api);
|
||||
TessBaseAPIDelete(api);
|
||||
TessBaseAPIEnd(api);
|
||||
TessBaseAPIDelete(api);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user