Compare commits


48 Commits

Author SHA1 Message Date
4dedd281f1 Push compiled vue changes 2022-01-09 09:30:31 -05:00
65c499e477 Merge pull request #231 from simon987/dev (v2.11.6) 2022-01-09 09:28:24 -05:00
625f3d0d6e Option to update media type tab in real time, add media type table in details 2022-01-08 18:23:22 -05:00
64b8aab8bf Validate that all the tesseract data files are in the same folder 2022-01-08 15:04:07 -05:00
ad95684771 Update --ocr-* args, enable OCR'ing images 2022-01-08 14:24:50 -05:00
b37e5a4ad4 Fix some warnings in media.c 2022-01-08 11:06:14 -05:00
15ae2190cf Fix tesseract lang validation, update README.md, fix tesseract memory leak 2022-01-08 11:04:52 -05:00
255bc2d689 Tweak MIN_OCR_SIZE behavior, update gitignore 2022-01-08 10:33:02 -05:00
fe1aa6dd4c Merge pull request #227 from yatli/dev (refactor: split ocr_extract_text from ebook) 2022-01-08 10:25:41 -05:00
cd2a44e016 Update ocr.h (Fix minimum image size validation in ocr_extract_text) 2022-01-08 10:24:57 -05:00
ed2a3f342a Localize tag add/delete, fix some translations, add LanguageIcon, add --lang arg, fix lightbox slideshow time, fix gif hover 2022-01-08 10:03:38 -05:00
1107fe9a53 Remove libscan hash debug info 2022-01-08 10:00:34 -05:00
a96e65d039 Add zh-CN option in language dropdown 2022-01-07 17:44:49 -05:00
87936eecd4 Merge pull request #229 from yatli/master (add zh-CN translation) 2022-01-07 13:55:14 -05:00
Yatao Li d817a0e9dd add zh-CN translation 2022-01-08 01:39:50 +08:00
Yatao Li 94a5e0ac59 refactor: split ocr_extract_text from ebook 2022-01-07 23:20:35 +08:00
d40f5052f9 static link for libasan in debug build 2021-12-29 19:25:03 -05:00
ee9a8fa514 Add thread lock for incremental_mark_file_for_copy() 2021-12-29 19:18:10 -05:00
81008d8936 Add --list-file argument 2021-12-29 18:54:13 -05:00
52466d5d8a Update tesseract datapaths 2021-12-25 11:12:00 -05:00
5f73fc024b Version bump, update readme 2021-12-25 11:08:52 -05:00
f2fd7ccf41 Fix raw parsing maybe, fix index picker css 2021-12-25 11:08:52 -05:00
d87fee8e00 Merge pull request #214 from dpieski/patch-2 (Update USAGE.md) 2021-12-22 09:55:24 -05:00
Andrew 672d1344d7 Update USAGE.md (Get-WmiObject is deprecated in favor of Get-CimInstance) 2021-12-15 15:00:36 -06:00
27e32db1ed Fix attempt for excludes 2021-11-17 20:18:48 -05:00
bb91139ffb console log fixes, version bump 2021-11-15 20:52:24 -05:00
70cfa8c37c Fix Dockerfile.arm64 2021-11-13 18:25:24 -05:00
7493dedc8c Merge pull request #208 from simon987/dev (v2.11.4) 2021-11-13 17:37:47 -05:00
c786a31bb2 Merge remote-tracking branch 'origin/master' into dev (# Conflicts: README.md) 2021-11-13 17:36:55 -05:00
48d024e751 Update dockerfiles 2021-11-13 17:36:30 -05:00
08b2ca9d43 Update lcms -> lcms2 2021-11-12 11:29:50 -05:00
ed8b4f4fad Add natural sorting support 2021-11-12 10:33:51 -05:00
66de93a8bd Language & formatting 2021-11-12 10:17:32 -05:00
e3f78fb693 Shift click & select all/none in index picker 2021-11-12 10:12:25 -05:00
030643cee0 Move CI scripts to script folder 2021-11-12 09:05:37 -05:00
b17b9439df Print progress bar in index module 2021-11-07 13:20:05 -05:00
414f65346c Update docker command in README.md 2021-11-07 13:18:32 -05:00
be8eedc9c7 Skip subtree of excluded directories 2021-11-07 11:56:09 -05:00
5b62fe77f2 Update demo URL 2021-11-07 09:52:28 -05:00
61ab68ce15 Update argparse repo URL 2021-11-07 09:42:17 -05:00
82ecb8bb85 Update gitignore 2021-11-07 09:36:39 -05:00
a41b5dcc1f Remove libscan git submodule 2021-11-07 09:30:14 -05:00
06f21d5f0f Remove libscan submodule 2021-11-07 09:17:02 -05:00
e82a388d1e Don't show resolution badge on narrow images 2021-10-22 10:21:35 -04:00
bf02e571b3 Forgot to add that file two commits ago 2021-10-22 09:44:56 -04:00
750a392a61 Show reduced ResuldCard when there are no results 2021-10-22 09:32:17 -04:00
3d7b977a82 Read ES version, handle legacy versions, add notice & debug info 2021-10-21 19:14:43 -04:00
cd71551a22 Some documentation updates 2021-09-25 09:30:53 -04:00
104 changed files with 6787 additions and 241 deletions

View File

@@ -10,22 +10,7 @@ steps:
- name: build
image: simon987/sist2-build
commands:
- ./ci/build.sh
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
- ./scripts/build.sh
- name: scp files
image: appleboy/drone-scp
settings:
@@ -42,6 +27,21 @@ steps:
- ./VERSION
- ./sist2-x64-linux
- ./sist2-x64-linux-debug
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
---
kind: pipeline
@@ -55,7 +55,7 @@ steps:
- name: build
image: simon987/sist2-build-arm64
commands:
- ./ci/build_arm64.sh
- ./scripts/build_arm64.sh
- name: scp files
image: appleboy/drone-scp
settings:

.gitignore vendored (8 changes)
View File

@@ -10,17 +10,19 @@ Makefile
LOG
sist2*
!sist2-vue/
index.sist2/
*.sist2/
bundle*.css
bundle.js
*.a
vgcore.*
build/
third-party/
third-party/argparse
*.idx/
VERSION
git_hash.h
Testing/
test_i
test_i_inc
node_modules/
node_modules/
.cmake/
i_inc/

.gitmodules vendored (11 changes)
View File

@@ -1,6 +1,9 @@
[submodule "third-party/libscan"]
path = third-party/libscan
url = https://github.com/simon987/libscan
[submodule "third-party/argparse"]
path = third-party/argparse
url = https://github.com/cofyc/argparse
url = https://github.com/simon987/argparse
[submodule "third-party/libscan/third-party/utf8.h"]
path = third-party/libscan/third-party/utf8.h
url = https://github.com/sheredom/utf8.h
[submodule "third-party/libscan/third-party/antiword"]
path = third-party/libscan/third-party/antiword
url = https://github.com/simon987/antiword

View File

@@ -22,9 +22,6 @@ add_subdirectory(third-party/argparse)
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
@@ -41,7 +38,11 @@ add_executable(sist2
src/log.c src/log.h
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h)
src/parsing/sidecar.c src/parsing/sidecar.h
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
@@ -86,6 +87,7 @@ if (SIST_DEBUG)
sist2
PRIVATE
-fsanitize=address
-static-libasan
)
set_target_properties(
sist2

View File

@@ -6,12 +6,10 @@ COPY . .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2
RUN ls -lh
RUN ls -lh sist2-vue/dist/
FROM ubuntu:20.10
FROM ubuntu:21.10
RUN apt update && apt install -y curl libasan5
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -22,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -7,9 +7,9 @@ RUN cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE
RUN make -j$(nproc)
RUN strip sist2
FROM ubuntu:20.10
FROM --platform="linux/arm64/v8" ubuntu:21.10
RUN apt update && apt install -y curl libasan5
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -20,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -2,7 +2,7 @@
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/?i=Demo%20files)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
# sist2
@@ -10,7 +10,7 @@ sist2 (Simple incremental search tool)
*Warning: sist2 is in early development*
![sist2.png](docs/sist2.png)
![search panel](docs/sist2.png)
## Features
@@ -33,12 +33,11 @@ sist2 (Simple incremental search tool)
## Getting Started
1. Have an Elasticsearch (>= 6.X.X) instance running
1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
```bash
docker run -d --name es1 --net sist2_net -p 9200:9200 \
-e "discovery.type=single-node" elasticsearch:7.14.0
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.0
```
1. *(or)* Run using docker-compose:
```yaml
@@ -52,7 +51,7 @@ sist2 (Simple incremental search tool)
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.2-x64-linux`
1. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
1. See [Usage guide](docs/USAGE.md)
@@ -68,23 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
## Format support
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | [libscan](https://github.com/simon987/libscan) | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | [libscan](https://github.com/simon987/libscan) | yes | no | - |
html, xml | [libscan](https://github.com/simon987/libscan) | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | [libscan](https://github.com/simon987/libscan) | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
wpd (WordPerfect) | libwpd | yes | no | *planned* |
json, jsonl, ndjson | [libscan](https://github.com/simon987/libscan) | yes | - | - |
| File type | Library | Content | Thumbnail | Metadata |
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
| `audio/*` | ffmpeg | - | yes | ID3 tags |
| `video/*` | ffmpeg | - | yes | title, comment, artist |
| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
| mobi, azw, azw3 | libmobi | yes | no | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)*
@@ -103,18 +102,24 @@ scan is also supported.
### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with the
`--ocr-lang <lang>` option in combination with `--ocr-images` and/or `--ocr-ebooks`.
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
Examples
You can use the `+` separator to specify multiple languages. The language
name must be identical to the `*.traineddata` file installed on your system
(use `chi_sim` rather than `chi-sim`).
Examples:
```bash
sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/
sist2 scan --ocr-ebooks --ocr-lang jpn ~/Books/Manga/
sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
```
## Build from source
@@ -127,7 +132,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
git clone --recursive https://github.com/simon987/sist2/
cd sist2
docker build . -f ./Dockerfile -t my-sist2-image
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
```
### On a linux computer
@@ -144,7 +149,7 @@ docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
```bash
vcpkg install curl[core,openssl]
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libuuid libmagic libraw jasper lcms gumbo
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
```
1. Build

View File

@@ -14,6 +14,7 @@
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
@@ -42,7 +43,7 @@ Scan options
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--archive-passphrase=<str> Passphrase for encrypted archive files
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
# TODO: add new --ocr-* options here
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
@@ -84,7 +85,7 @@ Exec-script options
### Scan options
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size`
@@ -266,9 +267,20 @@ sist2 web index1 index2 index3 index4
When the `rewrite_url` field is not empty, the web module ignores the `root`
field and will return a HTTP redirect to `<rewrite_url><path>/<name><extension>`
instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
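
To make the redirect rule above concrete, here is a minimal sketch of a hand-edited `descriptor.json`. Only the two fields discussed in this section are shown; the remaining descriptor fields are omitted and the values are invented for illustration:

```json
{
  "root": "/mnt/archive/",
  "rewrite_url": "https://files.example.com/archive/"
}
```

With this descriptor, a document stored at `photos/cat.jpg` is returned as an HTTP redirect to `https://files.example.com/archive/photos/cat.jpg` rather than being served from `/mnt/archive/`.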
# Elasticsearch
Elasticsearch versions >=6.8.0, <8.0.0 are supported by sist2.
Using a version >=7.14.0 is recommended to enable the following features:
- Bug fix for large documents (See #198)
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
If you don't care about the features above, you can ignore it or disable it in the configuration page.
## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.
@@ -303,7 +315,7 @@ See [scripting](scripting.md) documentation.
# Sidecar files
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
original document's metadata. Sidecar metadata files will also work inside archives.
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
Sidecar files themselves are not saved in the index.
This feature is useful to leverage third-party applications such as speech-to-text or
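
As a rough example, a sidecar file could look like the sketch below, assuming it sits next to the document it describes (e.g. `recording.mp3.s2meta` beside `recording.mp3`) and contains a flat JSON object of metadata fields; the field names and values here are purely illustrative:

```json
{
  "content": "Transcript produced by an external speech-to-text tool.",
  "author": "Jane Doe"
}
```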

Binary file not shown (size before: 3.9 KiB, after: 35 KiB).

Binary file not shown (size before: 889 KiB, after: 1011 KiB).

View File

@@ -78,6 +78,7 @@
"name": {
"analyzer": "content_analyzer",
"type": "text",
"fielddata": true,
"fields": {
"nGram": {
"type": "text",

View File

@@ -0,0 +1,58 @@
{
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
},
"analysis": {
"tokenizer": {
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"tag_tokenizer": {
"type": "path_hierarchy",
"delimiter": "."
},
"my_nGram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 3
}
},
"analyzer": {
"path_analyzer": {
"tokenizer": "path_tokenizer",
"filter": [
"lowercase"
]
},
"tag_analyzer": {
"tokenizer": "tag_tokenizer",
"filter": [
"lowercase"
]
},
"case_insensitive_kw_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
},
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
},
"content_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}

View File

@@ -6,5 +6,4 @@ python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
printf "static const char *const LibScanCommitHash = \"%s\";\n" $(cd third-party/libscan/ && git rev-parse HEAD) >> src/git_hash.h
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h

View File

@@ -3,6 +3,7 @@ import json
files = [
"schema/mappings.json",
"schema/settings.json",
"schema/settings_legacy.json",
"schema/pipeline.json",
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -51,6 +51,7 @@ export interface EsHit {
duration: number
tag: string[]
checksum: string
thumbnail: string
}
_props: {
isSubDocument: boolean
@@ -61,6 +62,8 @@ export interface EsHit {
isPlayableImage: boolean
isAudio: boolean
hasThumbnail: boolean
tnW: number
tnH: number
}
highlight: {
name: string[] | undefined,
@@ -131,6 +134,8 @@ class Sist2Api {
if ("thumbnail" in hit._source) {
hit._props.hasThumbnail = true;
hit._props.tnW = Number(hit._source.thumbnail.split(",")[0]);
hit._props.tnH = Number(hit._source.thumbnail.split(",")[1]);
}
switch (mimeCategory) {
@@ -251,20 +256,31 @@ class Sist2Api {
});
}
getMimeTypes() {
return this.esQuery({
aggs: {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
getMimeTypes(query = undefined) {
const AGGS = {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
},
size: 0,
}).then(resp => {
}
};
if (!query) {
query = {
aggs: AGGS,
size: 0,
};
} else {
query.size = 0;
query.aggs = AGGS;
}
return this.esQuery(query).then(resp => {
const mimeMap: any[] = [];
resp["aggregations"]["mimeTypes"]["buckets"].sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const buckets = resp["aggregations"]["mimeTypes"]["buckets"];
buckets.sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const tmp = bucket["key"].split("/");
const category = tmp[0];
const mime = tmp[1];
@@ -284,11 +300,18 @@ class Sist2Api {
});
if (!category_exists) {
mimeMap.push({"text": category, children: [child]});
mimeMap.push({text: category, children: [child], id: category});
}
})
return mimeMap;
mimeMap.forEach(node => {
if (node.children) {
node.children.sort((a, b) => a.id.localeCompare(b.id));
}
})
mimeMap.sort((a, b) => a.id.localeCompare(b.id))
return {buckets, mimeMap};
});
}
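
To make the updated `getMimeTypes` signature concrete, here is a short usage sketch based on the calls visible elsewhere in this changeset (the `@/...` import aliases mirror the ones used in the Vue sources; the `console.log` calls are placeholders):

```typescript
import Sist2Api from "@/Sist2Api";
import Sist2Query from "@/Sist2Query";

// Without an argument: aggregate media types over all documents, as before.
Sist2Api.getMimeTypes().then(({buckets}) => {
    console.log(`media types across all documents: ${buckets.length}`);
});

// With a query: reuse the current search query so the media type tree and the
// details aggregation only reflect documents matching the active filters.
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({mimeMap}) => {
    console.log("filtered media type tree:", mimeMap);
});
```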

View File

@@ -43,6 +43,20 @@ const SORT_MODES = {
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.size
},
nameAsc: {
mode: [
{name: {order: "asc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
},
nameDesc: {
mode: [
{name: {order: "desc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
}
} as any;
@@ -73,6 +87,8 @@ class Sist2Query {
const selectedMimeTypes = getters.selectedMimeTypes;
const selectedTags = getters.selectedTags;
const legacyES = store.state.sist2Info.esVersionLegacy;
const filters = [
{terms: {index: selectedIndexIds}}
] as any[];
@@ -187,9 +203,13 @@ class Sist2Query {
"name.nGram": {},
"content.nGram": {},
font_name: {},
},
max_analyzed_offset: 9_999_999
}
};
if (!legacyES) {
q.highlight.max_analyzed_offset = 9_999_999;
}
if (getters.optSearchInPath) {
q.highlight.fields["path.text"] = {};
q.highlight.fields["path.nGram"] = {};

View File

@@ -5,7 +5,6 @@
<b-card-body>
<!-- TODO: ES connectivity, Link to GH page -->
<b-table :items="tableItems" small borderless responsive="md" thead-class="hidden" class="mb-0"></b-table>
<hr />
@@ -16,7 +15,7 @@
<script>
import IndexDebugInfo from "@/components/IndexDebugInfo";
import DebugIcon from "@/components/DebugIcon";
import DebugIcon from "@/components/icons/DebugIcon";
export default {
name: "DebugInfo.vue",
@@ -28,10 +27,12 @@ export default {
{key: "platform", value: this.$store.state.sist2Info.platform},
{key: "debugBinary", value: this.$store.state.sist2Info.debug},
{key: "sist2CommitHash", value: this.$store.state.sist2Info.sist2Hash},
{key: "libscanCommitHash", value: this.$store.state.sist2Info.libscanHash},
{key: "esIndex", value: this.$store.state.sist2Info.esIndex},
{key: "tagline", value: this.$store.state.sist2Info.tagline},
{key: "dev", value: this.$store.state.sist2Info.dev},
{key: "esVersion", value: this.$store.state.sist2Info.esVersion},
{key: "esVersionSupported", value: this.$store.state.sist2Info.esVersionSupported},
{key: "esVersionLegacy", value: this.$store.state.sist2Info.esVersionLegacy},
]
}
}

View File

@@ -15,11 +15,15 @@
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
<div v-if="doc._props.isImage && !hover" class="card-img-overlay" :class="{'small-badge': smallBadge}">
<div
v-if="doc._props.isImage && !hover && doc._props.tnW / doc._props.tnH < 5"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ `${doc._source.width}x${doc._source.height}` }}</span>
</div>
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover" class="card-img-overlay"
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
@@ -30,16 +34,19 @@
</svg>
</div>
<img v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
<img ref="tn"
v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
:src="(doc._props.isGif && hover) ? `f/${doc._id}` : `t/${doc._source.index}/${doc._id}`"
alt=""
:style="{height: (doc._props.isGif && hover) ? `${tnHeight()}px` : undefined}"
class="pointer fit card-img-top" @click="onThumbnailClick()">
<img v-else :src="`t/${doc._source.index}/${doc._id}`" alt=""
class="fit card-img-top">
</div>
<!-- Audio player-->
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls :type="doc._source.mime"
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls
:type="doc._source.mime"
:src="`f/${doc._id}`"
@play="onAudioPlay()"></audio>
@@ -117,6 +124,9 @@ export default {
},
onTnLeave() {
this.hover = false;
},
tnHeight() {
return this.$refs.tn.height;
}
},
}

View File

@@ -1,5 +1,6 @@
<template>
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}">
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}"
@mouseenter="onTnEnter()" @mouseleave="onTnLeave()" >
<!-- Info modal-->
<DocInfoModal :show="showInfo" :doc="doc" @close="showInfo = false"></DocInfoModal>
@@ -56,7 +57,7 @@ import TagContainer from "@/components/TagContainer";
import DocFileTitle from "@/components/DocFileTitle";
import DocInfoModal from "@/components/DocInfoModal";
import ContentDiv from "@/components/ContentDiv";
import FileIcon from "@/components/FileIcon";
import FileIcon from "@/components/icons/FileIcon";
export default {
name: "DocListItem",
@@ -85,7 +86,13 @@ export default {
return this.doc.highlight["path.nGram"] + "/"
}
return this.doc._source.path + "/"
}
},
onTnEnter() {
this.hover = true;
},
onTnLeave() {
this.hover = false;
},
}
}
</script>

View File

@@ -7,11 +7,27 @@
value-field="id"></b-form-select>
</div>
<div v-else>
<b-list-group id="index-picker-desktop">
<div class="d-flex justify-content-between align-content-center">
<span>
{{ selectedIndices.length }}
{{ selectedIndices.length === 1 ? $t("indexPicker.selectedIndex") : $t("indexPicker.selectedIndices") }}
</span>
<div>
<b-button variant="link" @click="selectAll()"> {{ $t("indexPicker.selectAll") }}</b-button>
<b-button variant="link" @click="selectNone()"> {{ $t("indexPicker.selectNone") }}</b-button>
</div>
</div>
<b-list-group id="index-picker-desktop" class="unselectable">
<b-list-group-item
v-for="idx in indices"
@click="toggleIndex(idx)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer">
@click="toggleIndex(idx, $event)"
@click.shift="shiftClick(idx, $event)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer"
:class="{active: lastClickIndex === idx}"
>
<div class="d-flex">
<b-checkbox @change="toggleIndex(idx)" :checked="isSelected(idx)"></b-checkbox>
{{ idx.name }}
@@ -36,6 +52,7 @@ export default Vue.extend({
data() {
return {
loading: true,
lastClickIndex: null
}
},
computed: {
@@ -53,13 +70,50 @@ export default Vue.extend({
...mapActions({
setSelectedIndices: "setSelectedIndices"
}),
shiftClick(index, e) {
if (this.lastClickIndex === null) {
return;
}
const select = this.isSelected(this.lastClickIndex);
let leftBoundary = this.indices.indexOf(this.lastClickIndex);
let rightBoundary = this.indices.indexOf(index);
if (rightBoundary < leftBoundary) {
let tmp = leftBoundary;
leftBoundary = rightBoundary;
rightBoundary = tmp;
}
for (let i = leftBoundary; i <= rightBoundary; i++) {
if (select) {
if (!this.isSelected(this.indices[i])) {
this.setSelectedIndices([this.indices[i], ...this.selectedIndices]);
}
} else {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx !== this.indices[i]));
}
}
},
selectAll() {
this.setSelectedIndices(this.indices);
},
selectNone() {
this.setSelectedIndices([]);
},
onSelect(value) {
this.setSelectedIndices(this.indices.filter(idx => value.includes(idx.id)));
},
formatIdxDate(timestamp: number): string {
return format(new Date(timestamp * 1000), "yyyy-MM-dd");
},
toggleIndex(index) {
toggleIndex(index, e) {
if (e.shiftKey) {
return;
}
this.lastClickIndex = index;
if (this.isSelected(index)) {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx.id != index.id));
} else {
@@ -79,6 +133,11 @@ export default Vue.extend({
font-size: 80%;
}
.theme-black .version-badge {
color: #eee !important;
background: none;
}
.version-badge {
color: #222 !important;
background: none;
@@ -92,4 +151,21 @@ export default Vue.extend({
overflow-y: auto;
max-height: 132px;
}
.btn-link:focus {
box-shadow: none;
}
.unselectable {
user-select: none;
-ms-user-select: none;
-moz-user-select: none;
-webkit-user-select: none;
}
.list-group-item.active {
z-index: 2;
background-color: inherit;
color: inherit;
}
</style>

View File

@@ -1,6 +1,5 @@
<template>
<div>
<!-- TODO: Set slideshowTime as a configurable option-->
<FsLightbox
:key="lightboxKey"
:toggler="showLightbox"
@@ -10,7 +9,7 @@
:types="lightboxTypes"
:source-index="lightboxSlide"
:custom-toolbar-buttons="customButtons"
:slideshow-time="1000 * 10"
:slideshow-time="$store.getters.optLightboxSlideDuration * 1000"
:zoom-increment="0.5"
:load-only-current-source="$store.getters.optLightboxLoadOnlyCurrent"
:on-close="onClose"

View File

@@ -7,37 +7,24 @@ import InspireTree from "inspire-tree";
import InspireTreeDOM from "inspire-tree-dom";
import "inspire-tree-dom/dist/inspire-tree-light.min.css";
import {getSelectedTreeNodes} from "@/util";
import {getSelectedTreeNodes, getTreeNodeAttributes} from "@/util";
import Sist2Api from "@/Sist2Api";
import Sist2Query from "@/Sist2Query";
export default {
name: "MimePicker",
data() {
return {
mimeTree: null,
stashedMimeTreeAttributes: null
}
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
const mimeMap = mutation.payload.slice();
this.mimeTree = new InspireTree({
selection: {
mode: 'checkbox'
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: '#mimeTree'
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
if (mutation.type === "setUiMimeMap" && this.mimeTree === null) {
this.initializeTree();
} else if (mutation.type === "busSearch") {
this.updateTree();
}
});
},
@@ -49,6 +36,73 @@ export default {
this.$store.commit("setSelectedMimeTypes", getSelectedTreeNodes(this.mimeTree));
},
updateTree() {
if (this.$store.getters.optUpdateMimeMap === false) {
return;
}
if (this.stashedMimeTreeAttributes === null) {
this.stashedMimeTreeAttributes = getTreeNodeAttributes(this.mimeTree);
}
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({buckets, mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.$store.commit("setUiDetailsMimeAgg", buckets);
this.mimeTree.removeAll();
this.mimeTree.addNodes(mimeMap);
// Restore selected mimes
if (this.stashedMimeTreeAttributes === null) {
// NOTE: This happens when successive fast searches are triggered
this.stashedMimeTreeAttributes = {};
// Always add the selected mime types
this.$store.state.selectedMimeTypes.forEach(mime => {
this.stashedMimeTreeAttributes[mime] = {
checked: true
}
});
}
Object.entries(this.stashedMimeTreeAttributes).forEach(([mime, attributes]) => {
if (this.mimeTree.node(mime)) {
if (attributes.checked) {
this.mimeTree.node(mime).select();
}
if (attributes.collapsed === false) {
this.mimeTree.node(mime).expand();
}
}
});
this.stashedMimeTreeAttributes = null;
});
},
initializeTree() {
const mimeMap = this.$store.state.uiMimeMap;
this.mimeTree = new InspireTree({
selection: {
mode: "checkbox"
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: "#mimeTree"
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
}
}
}
</script>

View File

@@ -8,7 +8,8 @@
</b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span><span v-if="isLegacy() && !hideLegacy()">-<a
href="https://github.com/simon987/sist2/blob/master/docs/USAGE.md#elasticsearch" target="_blank">legacyES</a></span>
</span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>
@@ -19,7 +20,8 @@
</template>
<script>
import Sist2Icon from "@/components/Sist2Icon";
import Sist2Icon from "@/components/icons/Sist2Icon";
export default {
name: "NavBar",
components: {Sist2Icon},
@@ -32,6 +34,12 @@ export default {
},
isDebug() {
return this.$store.state.sist2Info.debug;
},
isLegacy() {
return this.$store.state.sist2Info.esVersionLegacy;
},
hideLegacy() {
return this.$store.state.optHideLegacy;
}
}
}
@@ -95,7 +103,7 @@ export default {
}
}
.theme-light .btn-link{
.theme-light .btn-link {
color: #222;
}
</style>

View File

@@ -3,31 +3,56 @@
<span>{{ hitCount }} {{ hitCount === 1 ? $t("hit") : $t("hits") }}</span>
<div style="float: right">
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile">{{ $t("details") }}</b-button>
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile" @click="onToggle()">{{
$t("details")
}}
</b-button>
<SortSelect class="ml-2"></SortSelect>
<template v-if="hitCount !== 0">
<SortSelect class="ml-2"></SortSelect>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
</template>
</div>
<b-collapse id="collapse-1" class="pt-2" style="clear:both;">
<b-card>
<b-table :items="tableItems" small borderless thead-class="hidden" class="mb-0"></b-table>
<b-table :items="tableItems" small borderless bordered thead-class="hidden" class="mb-0"></b-table>
<br/>
<h4>
{{$t("mimeTypes")}}
<b-button size="sm" variant="primary" class="float-right" @click="onCopyClick"><ClipboardIcon/></b-button>
</h4>
<Preloader v-if="$store.state.uiDetailsMimeAgg == null"></Preloader>
<b-table
v-else
sort-by="doc_count"
:sort-desc="true"
thead-class="hidden"
:items="$store.state.uiDetailsMimeAgg" small bordered class="mb-0"
></b-table>
</b-card>
</b-collapse>
</b-card>
</template>
<script lang="ts">
import {EsResult} from "@/Sist2Api";
import Sist2Api, {EsResult} from "@/Sist2Api";
import Vue from "vue";
import {humanFileSize, humanTime} from "@/util";
import {humanFileSize} from "@/util";
import DisplayModeToggle from "@/components/DisplayModeToggle.vue";
import SortSelect from "@/components/SortSelect.vue";
import Preloader from "@/components/Preloader.vue";
import Sist2Query from "@/Sist2Query";
import ClipboardIcon from "@/components/icons/ClipboardIcon.vue";
export default Vue.extend({
name: "ResultsCard",
components: {SortSelect, DisplayModeToggle},
components: {ClipboardIcon, Preloader, SortSelect, DisplayModeToggle},
created() {
},
computed: {
lastResultsLoaded() {
return this.$store.state.lastQueryResults != null;
@@ -52,6 +77,39 @@ export default Vue.extend({
totalSize() {
return humanFileSize((this.$store.state.lastQueryResults as EsResult).aggregations.total_size.value);
},
onToggle() {
const show = !document.getElementById("collapse-1").classList.contains("show");
this.$store.commit("setUiShowDetails", show);
if (show && this.$store.state.uiDetailsMimeAgg == null && !this.$store.state.optUpdateMimeMap) {
// Mime aggs are not updated automatically, update now
this.forceUpdateMimeAgg();
}
},
onCopyClick() {
let tsvString = "";
this.$store.state.uiDetailsMimeAgg.slice().sort((a,b) => b["doc_count"] - a["doc_count"]).forEach(row => {
tsvString += `${row["key"]}\t${row["doc_count"]}\n`;
});
navigator.clipboard.writeText(tsvString);
this.$bvToast.toast(
this.$t("toast.copiedToClipboard"),
{
title: null,
noAutoHide: false,
toaster: "b-toaster-bottom-right",
headerClass: "hidden",
bodyClass: "toast-body-info",
});
},
forceUpdateMimeAgg() {
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({buckets}) => {
this.$store.commit("setUiDetailsMimeAgg", buckets);
});
}
},
});

View File

@@ -19,6 +19,14 @@
{{ $t("sort.sizeDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameDesc'}" @click="onSelect('nameDesc')">
{{ $t("sort.nameDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameAsc'}" @click="onSelect('nameAsc')">
{{ $t("sort.nameAsc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'random'}" @click="onSelect('random')">
{{ $t("sort.random") }}
</b-dropdown-item>

View File

@@ -51,7 +51,7 @@
>{{ tag.text.split(".").pop() }}</span>
<b-popover :target="hit._id+tag.rawText" triggers="focus blur" placement="top">
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">Delete</b-button>
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{$t("deleteTag")}}</b-button>
</b-popover>
</div>
@@ -63,7 +63,7 @@
</template>
<!-- Add button -->
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">Add</small>
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">{{$t("addTag")}}</small>
<!-- Size tag-->
<small v-else class="text-muted badge-size">{{

View File

@@ -120,7 +120,7 @@ export default {
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
if (mutation.type === "setUiMimeMap" && this.tagTree === null) {
this.initializeTree();
this.updateTree();
} else if (mutation.type === "busUpdateTags") {
@@ -147,6 +147,7 @@ export default {
this.tagTree.on("node.state.changed", this.handleTreeClick);
},
updateTree() {
// TODO: remember which tags are selected and restore?
const tagMap = [];
Sist2Api.getTags().then(tags => {
tags.forEach(tag => addTag(tagMap, tag.id, tag.id, tag.count));

View File

@@ -0,0 +1,21 @@
<template>
<svg style="width:24px;height:24px" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M17,9H7V7H17M17,13H7V11H17M14,17H7V15H14M12,3A1,1 0 0,1 13,4A1,1 0 0,1 12,5A1,1 0 0,1 11,4A1,1 0 0,1 12,3M19,3H14.82C14.4,1.84 13.3,1 12,1C10.7,1 9.6,1.84 9.18,3H5A2,2 0 0,0 3,5V19A2,2 0 0,0 5,21H19A2,2 0 0,0 21,19V5A2,2 0 0,0 19,3Z"/>
</svg>
</template>
<script>
export default {
name: "ClipboardIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -0,0 +1,21 @@
<template>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm1 16.057v-3.057h2.994c-.059 1.143-.212 2.24-.456 3.279-.823-.12-1.674-.188-2.538-.222zm1.957 2.162c-.499 1.33-1.159 2.497-1.957 3.456v-3.62c.666.028 1.319.081 1.957.164zm-1.957-7.219v-3.015c.868-.034 1.721-.103 2.548-.224.238 1.027.389 2.111.446 3.239h-2.994zm0-5.014v-3.661c.806.969 1.471 2.15 1.971 3.496-.642.084-1.3.137-1.971.165zm2.703-3.267c1.237.496 2.354 1.228 3.29 2.146-.642.234-1.311.442-2.019.607-.344-.992-.775-1.91-1.271-2.753zm-7.241 13.56c-.244-1.039-.398-2.136-.456-3.279h2.994v3.057c-.865.034-1.714.102-2.538.222zm2.538 1.776v3.62c-.798-.959-1.458-2.126-1.957-3.456.638-.083 1.291-.136 1.957-.164zm-2.994-7.055c.057-1.128.207-2.212.446-3.239.827.121 1.68.19 2.548.224v3.015h-2.994zm1.024-5.179c.5-1.346 1.165-2.527 1.97-3.496v3.661c-.671-.028-1.329-.081-1.97-.165zm-2.005-.35c-.708-.165-1.377-.373-2.018-.607.937-.918 2.053-1.65 3.29-2.146-.496.844-.927 1.762-1.272 2.753zm-.549 1.918c-.264 1.151-.434 2.36-.492 3.611h-3.933c.165-1.658.739-3.197 1.617-4.518.88.361 1.816.67 2.808.907zm.009 9.262c-.988.236-1.92.542-2.797.9-.89-1.328-1.471-2.879-1.637-4.551h3.934c.058 1.265.231 2.488.5 3.651zm.553 1.917c.342.976.768 1.881 1.257 2.712-1.223-.49-2.326-1.211-3.256-2.115.636-.229 1.299-.435 1.999-.597zm9.924 0c.7.163 1.362.367 1.999.597-.931.903-2.034 1.625-3.257 2.116.489-.832.915-1.737 1.258-2.713zm.553-1.917c.27-1.163.442-2.386.501-3.651h3.934c-.167 1.672-.748 3.223-1.638 4.551-.877-.358-1.81-.664-2.797-.9zm.501-5.651c-.058-1.251-.229-2.46-.492-3.611.992-.237 1.929-.546 2.809-.907.877 1.321 1.451 2.86 1.616 4.518h-3.933z"/>
</svg>
</template>
<script>
export default {
name: "LanguageIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -5,6 +5,8 @@ export default {
advanced: "Advanced search",
fuzzy: "Fuzzy"
},
addTag: "Add",
deleteTag: "Delete",
download: "Download",
and: "and",
page: "page",
@@ -63,7 +65,9 @@ export default {
slideDuration: "Slide duration",
resultSize: "Number of results per page",
tagOrOperator: "Use OR operator when specifying multiple tags.",
hideDuplicates: "Hide duplicate results based on checksum"
hideDuplicates: "Hide duplicate results based on checksum",
hideLegacy: "Hide the 'legacyES' Elasticsearch notice",
updateMimeMap: "Update the Media Types tree in real time"
},
queryMode: {
simple: "Simple",
@@ -71,7 +75,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grid",
@@ -125,18 +130,21 @@ export default {
esQueryErr: "Could not parse or execute query, please check the Advanced search documentation. " +
"See server logs for more information.",
dupeTagTitle: "Duplicate tag",
dupeTag: "This tag already exists for this document."
dupeTag: "This tag already exists for this document.",
copiedToClipboard: "Copied to clipboard"
},
saveTagModalTitle: "Add tag",
saveTagPlaceholder: "Tag name",
confirm: "Confirm",
indexPickerPlaceholder: "Select indices",
indexPickerPlaceholder: "Select an index",
sort: {
relevance: "Relevance",
dateAsc: "Date (Older first)",
dateDesc: "Date (Newer first)",
sizeAsc: "Size (Smaller first)",
sizeDesc: "Size (Larger first)",
nameAsc: "Name (A-z)",
nameDesc: "Name (Z-a)",
random: "Random",
},
d3: {
@@ -144,7 +152,13 @@ export default {
mimeSize: "Size distribution by media type",
dateHistogram: "File modification time distribution",
sizeHistogram: "File size distribution",
}
},
indexPicker: {
selectNone: "Select None",
selectAll: "Select All",
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
},
fr: {
searchBar: {
@@ -152,6 +166,8 @@ export default {
advanced: "Recherche avancée",
fuzzy: "Approximatif"
},
addTag: "Ajouter",
deleteTag: "Supprimer",
download: "Télécharger",
and: "et",
page: "page",
@@ -211,7 +227,9 @@ export default {
slideDuration: "Durée des diapositives",
resultSize: "Nombre de résultats par page",
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags",
hideDuplicates: "Masquer les résultats en double"
hideDuplicates: "Masquer les résultats en double",
hideLegacy: "Masquer la notice 'legacyES' Elasticsearch",
updateMimeMap: "Mettre à jour l'arbre de Types de médias en temps réel"
},
queryMode: {
simple: "Simple",
@@ -219,7 +237,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grille",
@@ -274,7 +293,8 @@ export default {
esQueryErr: "Impossible d'analyser ou d'exécuter la requête, veuillez consulter la documentation sur la " +
"recherche avancée. Voir les journaux du serveur pour plus d'informations.",
dupeTagTitle: "Tag en double",
dupeTag: "Ce tag existe déjà pour ce document."
dupeTag: "Ce tag existe déjà pour ce document.",
copiedToClipboard: "Copié dans le presse-papier"
},
saveTagModalTitle: "Ajouter un tag",
saveTagPlaceholder: "Nom du tag",
@@ -286,6 +306,8 @@ export default {
dateDesc: "Date (Plus récent)",
sizeAsc: "Taille (Plus petit)",
sizeDesc: "Taille (Plus grand)",
nameAsc: "Nom (A-z)",
nameDesc: "Nom (Z-a)",
random: "Aléatoire",
},
d3: {
@@ -293,6 +315,173 @@ export default {
mimeSize: "Distribution des tailles de fichiers par type de média",
dateHistogram: "Distribution des dates de modification",
sizeHistogram: "Distribution des tailles de fichier",
}
}
}
},
indexPicker: {
selectNone: "Sélectionner aucun",
selectAll: "Sélectionner tout",
selectedIndex: "indice sélectionné",
selectedIndices: "indices sélectionnés",
},
},
"zh-CN": {
searchBar: {
simple: "搜索",
advanced: "高级搜索",
fuzzy: "模糊搜索"
},
addTag: "添加",
deleteTag: "删除",
download: "下载",
and: "与",
page: "页",
pages: "页",
mimeTypes: "文件类型",
tags: "标签",
help: {
simpleSearch: "简易搜索",
advancedSearch: "高级搜索",
help: "帮助",
term: "<关键词>",
and: "与操作",
or: "或操作",
not: "反选单个关键词",
quotes: "括起来的部分视为一个关键词,保序",
prefix: "在词尾使用时,匹配前缀",
parens: "表达式编组",
tildeTerm: "匹配编辑距离以内的关键词",
tildePhrase: "匹配短语,容忍一些非匹配词",
example1:
"例如: <code>\"番茄\" +(炒蛋 | 牛腩) -饭</code> 将匹配" +
"短语 <i>番茄炒蛋</i>、<i>炒蛋</i> 或者 <i>牛腩</i>,而忽略任何带有" +
"<i>饭</i>的关键词.",
defaultOperator:
"表达式中无<code>+</code>或者<code>|</code>时,默认使用" +
"<code>+</code>(与操作)。",
fuzzy:
"选中<b>模糊搜索</b>选项时返回部分匹配的结果3-grams)。",
moreInfoSimple: "详细信息:<a target=\"_blank\" " +
"rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html\">Elasticsearch文档</a>",
moreInfoAdvanced: "高级搜索模式文档:<a target=\"_blank\" rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax\">Elasticsearch文档</a>"
},
config: "配置",
configDescription: "配置在此浏览器中实时保存。",
configReset: "重置所有设置",
searchOptions: "搜索选项",
treemapOptions: "树状图选项",
displayOptions: "显示选项",
opt: {
lang: "语言",
highlight: "启用高亮",
fuzzy: "默认使用模糊搜索",
searchInPath: "匹配文档路径",
suggestPath: "搜索框启用自动补全",
fragmentSize: "高亮上下文大小",
queryMode: "搜索模式",
displayMode: "显示",
columns: "列数",
treemapType: "树状图类属性",
treemapTiling: "树状图平铺",
treemapColorGroupingDepth: "树状图颜色编组深度(展开)",
treemapColor: "树状图颜色(折叠)",
treemapSize: "树状图大小",
theme: "主题",
lightboxLoadOnlyCurrent: "在图片查看器中,不要预读相邻的全图",
slideDuration: "幻灯片时长",
resultSize: "每页结果数",
tagOrOperator: "使用或操作OR匹配多个标签。",
hideDuplicates: "使用校验码隐藏重复结果",
hideLegacy: "隐藏'legacyES' Elasticsearch 通知",
updateMimeMap: "媒体类型树的实时更新"
},
queryMode: {
simple: "简单",
advanced: "高级",
},
lang: {
en: "English",
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "网格",
list: "列表",
},
columns: {
auto: "自动"
},
treemapType: {
cascaded: "折叠",
flat: "平铺(紧凑)"
},
treemapSize: {
small: "小",
medium: "中",
large: "大",
xLarge: "加大",
xxLarge: "加加大",
custom: "自订",
},
treemapTiling: {
binary: "Binary",
squarify: "Squarify",
slice: "Slice",
dice: "Dice",
sliceDice: "Slice & Dice",
},
theme: {
light: "亮",
black: "暗"
},
hit: "命中",
hits: "命中",
details: "详细信息",
stats: "统计信息",
queryTime: "查询时间",
totalSize: "总大小",
pathBar: {
placeholder: "过滤路径",
modalTitle: "选择路径"
},
debug: "调试信息",
debugDescription: "对调试除错有用的信息。 若您遇到bug或者想建议新功能请提交新Issue到" +
"<a href='https://github.com/simon987/sist2/issues/new/choose'>这里</a>.",
tagline: "标签栏",
toast: {
esConnErrTitle: "Elasticsearch连接错误",
esConnErr: "sist2 web 模块连接Elasticsearch出错。" +
"查看服务日志以获取更多信息。",
esQueryErrTitle: "查询错误",
esQueryErr: "无法识别或执行查询,请查阅高级搜索文档。" +
"查看服务日志以获取更多信息。",
dupeTagTitle: "重复标签",
dupeTag: "该标签已存在于此文档。",
copiedToClipboard: "复制到剪贴板"
},
saveTagModalTitle: "增加标签",
saveTagPlaceholder: "标签名",
confirm: "确认",
indexPickerPlaceholder: "选择一个索引",
sort: {
relevance: "相关度",
dateAsc: "日期(由旧到新)",
dateDesc: "日期(由新到旧)",
sizeAsc: "大小(从小到大)",
sizeDesc: "大小(从大到小)",
nameAsc: "名字A-z",
nameDesc: "名字 Z-a",
random: "随机",
},
d3: {
mimeCount: "各类文件数量分布",
mimeSize: "各类文件大小分布",
dateHistogram: "文件修改时间分布",
sizeHistogram: "文件大小分布",
},
indexPicker: {
selectNone: "清空",
selectAll: "全选",
selectedIndex: "选中索引",
selectedIndices: "选中索引",
},
},
}

View File

@@ -27,6 +27,7 @@ export default new Vuex.Store({
size: 60,
optLang: "en",
optLangIsDefault: true,
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
@@ -46,6 +47,8 @@ export default new Vuex.Store({
optTreemapColor: "PuBuGn",
optLightboxLoadOnlyCurrent: false,
optLightboxSlideDuration: 15,
optHideLegacy: false,
optUpdateMimeMap: true,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@@ -70,9 +73,14 @@ export default new Vuex.Store({
uiLightboxSlide: 0,
uiReachedScrollEnd: false,
uiDetailsMimeAgg: null,
uiShowDetails: false,
uiMimeMap: [] as any[]
},
mutations: {
setUiShowDetails: (state, val) => state.uiShowDetails = val,
setUiDetailsMimeAgg: (state, val) => state.uiDetailsMimeAgg = val,
setUiReachedScrollEnd: (state, val) => state.uiReachedScrollEnd = val,
setTags: (state, val) => state.tags = val,
setPathText: (state, val) => state.pathText = val,
@@ -81,7 +89,10 @@ export default new Vuex.Store({
setSist2Info: (state, val) => state.sist2Info = val,
setSeed: (state, val) => state.seed = val,
setOptHideDuplicates: (state, val) => state.optHideDuplicates = val,
setOptLang: (state, val) => state.optLang = val,
setOptLang: (state, val) => {
state.optLang = val;
state.optLangIsDefault = false;
},
setSortMode: (state, val) => state.sortMode = val,
setIndices: (state, val) => {
state.indices = val;
@@ -144,8 +155,11 @@ export default new Vuex.Store({
setOptTreemapColorGroupingDepth: (state, val) => state.optTreemapColorGroupingDepth = val,
setOptTreemapSize: (state, val) => state.optTreemapSize = val,
setOptTreemapColor: (state, val) => state.optTreemapColor = val,
setOptHideLegacy: (state, val) => state.optHideLegacy = val,
setOptUpdateMimeMap: (state, val) => state.optUpdateMimeMap = val,
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
setUiMimeMap: (state, val) => state.uiMimeMap = val,
@@ -155,8 +169,18 @@ export default new Vuex.Store({
busUpdateTags: () => {
// noop
},
busSearch: () => {
// noop
},
},
actions: {
setSist2Info: (store, val) => {
store.commit("setSist2Info", val);
if (store.state.optLangIsDefault) {
store.commit("setOptLang", val.lang);
}
},
loadFromArgs({commit}, route: Route) {
if (route.query.q) {
@@ -276,6 +300,7 @@ export default new Vuex.Store({
commit("setUiLightboxTypes", []);
commit("setUiLightboxCaptions", []);
commit("setUiLightboxKey", 0);
commit("setUiDetailsMimeAgg", null);
}
},
modules: {},
@@ -339,5 +364,7 @@ export default new Vuex.Store({
optLightboxLoadOnlyCurrent: state => state.optLightboxLoadOnlyCurrent,
optLightboxSlideDuration: state => state.optLightboxSlideDuration,
optResultSize: state => state.size,
optHideLegacy: state => state.optHideLegacy,
optUpdateMimeMap: state => state.optUpdateMimeMap,
}
})

View File

@@ -97,6 +97,30 @@ export function getSelectedTreeNodes(tree: any) {
return Array.from(selectedNodes);
}
export function getTreeNodeAttributes(tree: any) {
const nodes = tree.selectable();
const attributes = {};
for (let i = 0; i < nodes.length; i++) {
let id = null;
if (nodes[i].text.indexOf("(") !== -1 && nodes[i].values) {
id = nodes[i].values.slice(-1)[0];
} else {
id = nodes[i].id
}
attributes[id] = {
checked: nodes[i].itree.state.checked,
collapsed: nodes[i].itree.state.collapsed,
}
}
return attributes;
}
export function serializeMimes(mimes: string[]): string | undefined {
if (mimes.length == 0) {
return undefined;

View File

@@ -15,11 +15,8 @@
<h4>{{ $t("displayOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<label>{{ $t("opt.lang") }}</label>
<label><LanguageIcon/><span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<label>{{ $t("opt.theme") }}</label>
@@ -30,6 +27,20 @@
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<div style="height: 10px"></div>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<b-form-checkbox :checked="optUpdateMimeMap" @input="setOptUpdateMimeMap">
{{ $t("opt.updateMimeMap") }}
</b-form-checkbox>
</b-card>
<br/>
@@ -113,15 +124,15 @@
</template>
<script>
import Vue from "vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import DebugInfo from "@/components/DebugInfo.vue";
import Preloader from "@/components/Preloader.vue";
import sist2 from "@/Sist2Api";
import GearIcon from "@/components/GearIcon.vue";
import GearIcon from "@/components/icons/GearIcon.vue";
import LanguageIcon from "@/components/icons/LanguageIcon";
export default {
components: {GearIcon, DebugInfo, Preloader},
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: true,
@@ -129,6 +140,7 @@ export default {
langOptions: [
{value: "en", text: this.$t("lang.en")},
{value: "fr", text: this.$t("lang.fr")},
{value: "zh-CN", text: this.$t("lang.zh-CN")},
],
queryModeOptions: [
{value: "simple", text: this.$t("queryMode.simple")},
@@ -215,6 +227,8 @@ export default {
"optTagOrOperator",
"optLang",
"optHideDuplicates",
"optHideLegacy",
"optUpdateMimeMap",
]),
clientWidth() {
return window.innerWidth;
@@ -222,7 +236,7 @@ export default {
},
mounted() {
sist2.getSist2Info().then(data => {
this.$store.commit("setSist2Info", data)
this.setSist2Info(data);
this.loading = false;
});
@@ -233,6 +247,9 @@ export default {
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
@@ -250,11 +267,12 @@ export default {
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptContainerWidth",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",
"setOptHideDuplicates"
"setOptHideDuplicates",
"setOptHideLegacy",
"setOptUpdateMimeMap"
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");

View File

@@ -31,7 +31,7 @@
</b-row>
</b-col>
<b-col>
<b-tabs>
<b-tabs justified>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
@@ -43,9 +43,13 @@
</b-row>
</b-card>
<Preloader v-if="searchBusy && docs.length === 0" class="mt-3"></Preloader>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<div v-else-if="docs.length > 0">
<ResultsCard></ResultsCard>
</div>
<div v-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
@@ -56,7 +60,7 @@
<script lang="ts">
import Preloader from "@/components/Preloader.vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import sist2 from "../Sist2Api";
import Sist2Api, {EsHit, EsResult} from "../Sist2Api";
import SearchBar from "@/components/SearchBar.vue";
@@ -109,10 +113,6 @@ export default Vue.extend({
}, 350, {leading: false});
Sist2Api.getMimeTypes().then(mimeMap => {
this.$store.commit("setUiMimeMap", mimeMap);
});
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
@@ -138,17 +138,23 @@ export default Vue.extend({
sist2.getSist2Info().then(data => {
this.setSist2Info(data);
this.setIndices(data.indices);
this.uiLoading = false;
this.search(true);
Sist2Api.getMimeTypes(Sist2Query.searchQuery()).then(({mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
}).catch(() => {
this.showErrorToast();
});
});
},
methods: {
...mapMutations({
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",
@@ -179,6 +185,7 @@ export default Vue.extend({
async searchNow(q: any) {
this.searchBusy = true;
await this.$store.dispatch("incrementQuerySequence");
this.$store.commit("busSearch");
Sist2Api.esQuery(q).then(async (resp: EsResult) => {
await this.handleSearch(resp);
@@ -209,7 +216,7 @@ export default Vue.extend({
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
@@ -280,6 +287,11 @@ export default Vue.extend({
border: none;
}
.toast-header-info, .toast-body-info {
background: #2196f3;
color: #fff !important;
}
.toast-header-error, .toast-body-error {
background: #a94442;
color: #f2dede !important;


@@ -22,6 +22,7 @@
const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"/usr/share/tesseract-ocr/4.00/tessdata/",
"./",
NULL
};
@@ -145,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->name == NULL) {
args->name = g_path_get_basename(args->output);
} else {
char* tmp = malloc(strlen(args->name) + 1);
char *tmp = malloc(strlen(args->name) + 1);
strcpy(tmp, args->name);
args->name = tmp;
}
@@ -167,17 +168,50 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
return 1;
}
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
if (args->ocr_images && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
return 1;
}
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
return 1;
}
if (args->tesseract_lang != NULL) {
if (!args->ocr_ebooks && !args->ocr_images) {
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
return 1;
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
TessBaseAPI *api = TessBaseAPICreate();
const char *trained_data_path = NULL;
char *lang = malloc(strlen(args->tesseract_lang) + 1);
strcpy(lang, args->tesseract_lang);
lang = strtok(lang, "+");
while (lang != NULL) {
char filename[128];
sprintf(filename, "%s.traineddata", lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
}
if (trained_data_path != NULL && path != trained_data_path) {
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
"files must be in the same folder")
}
trained_data_path = path;
lang = strtok(NULL, "+");
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
@@ -185,7 +219,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
args->tesseract_path = trained_data_path;
}
if (args->exclude_regex != NULL) {
@@ -218,6 +252,19 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
if (args->list_path != NULL) {
if (strcmp(args->list_path, "-") == 0) {
args->list_file = stdin;
LOG_DEBUG("cli.c", "Using stdin as list file")
} else {
args->list_file = fopen(args->list_path, "r");
if (args->list_file == NULL) {
LOG_FATALF("main.c", "List file could not be opened: %s (%s)", args->list_path, errno);
}
}
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@@ -237,6 +284,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
return 0;
}
@@ -362,15 +410,15 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
args->es_index = DEFAULT_ES_INDEX;
}
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (args->tagline == NULL) {
args->tagline = DEFAULT_TAGLINE;
}
if (strlen(args->lang) != 2) {
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (strlen(args->lang) != 2 && strlen(args->lang) != 5) {
fprintf(stderr, "Invalid --lang value, see usage\n");
return 1;
}
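
A condensed, self-contained sketch of the multi-language validation added above, for readers skimming the hunk. It is illustrative only: find_file_in_paths() is stubbed out here (the real lookup over TESS_DATAPATHS lives in cli.c); the point is that a combined --ocr-lang value such as "eng+fra" is split on '+' and every traineddata file must resolve to the same folder, because TessBaseAPIInit3() accepts a single datapath.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stub standing in for the real find_file_in_paths() in cli.c (illustration only). */
static const char *find_file_in_paths(const char **paths, const char *filename) {
    (void) filename;
    return paths[0]; // pretend every .traineddata file is found in the first path
}

int main(void) {
    const char *TESS_DATAPATHS[] = {"/usr/share/tessdata/", NULL};
    const char *tesseract_lang = "eng+fra"; // e.g. --ocr-lang eng+fra

    char *lang_copy = strdup(tesseract_lang);
    const char *trained_data_path = NULL;

    for (char *lang = strtok(lang_copy, "+"); lang != NULL; lang = strtok(NULL, "+")) {
        char filename[128];
        snprintf(filename, sizeof(filename), "%s.traineddata", lang);

        const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
        if (path == NULL) {
            fprintf(stderr, "Could not find tesseract language file: %s\n", filename);
            return 1;
        }
        // All languages must share one tessdata folder: TessBaseAPIInit3() takes a single datapath.
        if (trained_data_path != NULL && path != trained_data_path) {
            fprintf(stderr, "traineddata files must be in the same folder\n");
            return 1;
        }
        trained_data_path = path;
    }
    free(lang_copy);

    printf("datapath=%s langs=%s\n", trained_data_path, tesseract_lang);
    return 0;
}
```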


@@ -21,6 +21,8 @@ typedef struct scan_args {
char *archive_passphrase;
char *tesseract_lang;
const char *tesseract_path;
int ocr_images;
int ocr_ebooks;
char *exclude_regex;
int fast;
const char* treemap_threshold_str;
@@ -29,6 +31,8 @@ typedef struct scan_args {
int read_subtitles;
int fast_epub;
int calculate_checksums;
char *list_path;
FILE *list_file;
} scan_args_t;
scan_args_t *scan_args_create();


@@ -2,6 +2,7 @@
ScanCtx_t ScanCtx = {
.stat_index_size = 0,
.stat_tn_size = 0,
.dbg_current_files = NULL,
.pool = NULL
};


@@ -17,6 +17,7 @@
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include "src/index/elastic.h"
#include <glib.h>
#include <pcre.h>
@@ -40,6 +41,7 @@ typedef struct {
GHashTable *original_table;
GHashTable *copy_table;
pthread_mutex_t copy_table_mu;
pcre *exclude;
pcre_extra *exclude_extra;
@@ -75,6 +77,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int batch_size;
tpool_t *pool;
@@ -86,6 +89,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int index_count;
char *auth_user;
@@ -94,7 +98,7 @@ typedef struct {
int tag_auth_enabled;
char *tagline;
struct index_t indices[256];
char lang[3];
char lang[10];
int dev;
} WebCtx_t;


@@ -253,7 +253,7 @@ void _elastic_flush(int max) {
} else {
print_errors(r);
LOG_INFOF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
LOG_DEBUGF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
delete_queue(max);
if (Indexer->queued != 0) {
@@ -356,7 +356,65 @@ void finish_indexer(char *script, int async_script, char *index_id) {
free_response(r);
}
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings) {
es_version_t *elastic_get_version(const char *es_url) {
response_t *r = web_get(es_url, 30);
char *tmp = malloc(r->size + 1);
memcpy(tmp, r->body, r->size);
*(tmp + r->size) = '\0';
cJSON *response = cJSON_Parse(tmp);
free(tmp);
free_response(r);
if (response == NULL) {
return NULL;
}
if (cJSON_GetObjectItem(response, "version") == NULL ||
cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number") == NULL) {
cJSON_Delete(response);
return NULL;
}
char *version_str = cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number")->valuestring;
es_version_t *version = malloc(sizeof(es_version_t));
const char *tok = strtok(version_str, ".");
version->major = atoi(tok);
tok = strtok(NULL, ".");
version->minor = atoi(tok);
tok = strtok(NULL, ".");
version->patch = atoi(tok);
cJSON_Delete(response);
return version;
}
void elastic_init(int force_reset, const char *user_mappings, const char *user_settings) {
es_version_t *es_version = elastic_get_version(IndexCtx.es_url);
IndexCtx.es_version = es_version;
if (es_version == NULL) {
LOG_FATAL("elastic.c", "Could not get ES version")
}
LOG_INFOF("elastic.c",
"Elasticsearch version is %s (supported=%d, legacy=%d)",
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), USE_LEGACY_ES_SETTINGS(es_version));
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
LOG_FATAL("elastic.c", "sist2 only supports Elasticsearch v6.8 or newer")
}
char *settings = NULL;
if (USE_LEGACY_ES_SETTINGS(es_version)) {
settings = settings_json;
} else {
settings = settings_legacy_json;
}
// Check if index exists
char url[4096];
@@ -392,7 +450,7 @@ void elastic_init(int force_reset, const char* user_mappings, const char* user_s
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_settings", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, user_settings ? user_settings : settings_json);
r = web_put(url, user_settings ? user_settings : settings);
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);


@@ -9,6 +9,26 @@ typedef struct es_bulk_line {
char line[0];
} es_bulk_line_t;
typedef struct {
int major;
int minor;
int patch;
} es_version_t;
#define VERSION_GE(version, maj, min) ((version)->major > (maj) || ((version)->major == (maj) && (version)->minor >= (min)))
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))
__always_inline
static const char *format_es_version(es_version_t *version) {
static char buf[64];
snprintf(buf, sizeof(buf), "%d.%d.%d", version->major, version->minor, version->patch);
return buf;
}
/**
* Note: indexer is *not* thread safe
*/
@@ -31,6 +51,8 @@ cJSON *elastic_get_document(const char *id_str);
char *elastic_get_status();
es_version_t *elastic_get_version(const char *es_url);
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
#endif
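
To make the new version gating concrete, here is a small worked example. The struct and macros are copied verbatim from elastic.h above; the driver loop and sample version numbers are illustrative only.

```c
#include <stdio.h>

/* Copied from elastic.h above so the example is self-contained. */
typedef struct {
    int major;
    int minor;
    int patch;
} es_version_t;

#define VERSION_GE(version, maj, min) ((version)->major > (maj) || ((version)->major == (maj) && (version)->minor >= (min)))
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))

int main(void) {
    es_version_t versions[] = {
        {6, 7, 2},   // too old: rejected at startup
        {6, 8, 23},  // supported, pre-7.14: legacy settings
        {7, 14, 0},  // supported, current settings
        {8, 1, 0},   // supported, current settings
    };

    for (int i = 0; i < 4; i++) {
        es_version_t *v = &versions[i];
        printf("%d.%d.%d supported=%d legacy=%d\n",
               v->major, v->minor, v->patch,
               IS_SUPPORTED_ES_VERSION(v), USE_LEGACY_ES_SETTINGS(v));
    }
    return 0;
}
```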

File diff suppressed because one or more lines are too long


@@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "parent";
case MetaExifMake:
return "exif_make";
case MetaExifDescription:
return "exif_description";
case MetaExifSoftware:
return "exif_software";
case MetaExifExposureTime:
@@ -150,6 +152,7 @@ char *build_json_string(document_t *doc) {
case MetaFontName:
case MetaParent:
case MetaExifMake:
case MetaExifDescription:
case MetaExifSoftware:
case MetaExifExposureTime:
case MetaExifFNumber:


@@ -23,7 +23,6 @@ store_t *store_create(const char *path, size_t chunk_size) {
}
store->size = (size_t) store->chunk_size;
ScanCtx.stat_tn_size = 0;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi


@@ -4,6 +4,8 @@
#include <ftw.h>
#define STR_STARTS_WITH(x, y) (strncmp(y, x, strlen(y) - 1) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
@@ -43,26 +45,91 @@ int sub_strings[30];
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ftw->level > ScanCtx.depth) {
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
return 0;
} else if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
return FTW_CONTINUE;
}
#define MAX_FILE_DESCRIPTORS 64
int walk_directory_tree(const char *dirpath) {
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_DEPTH);
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
}
int iterate_file_list(void *input_file) {
char buf[PATH_MAX];
struct stat info;
while (fgets(buf, sizeof(buf), input_file) != NULL) {
// Remove trailing newline
*(buf + strlen(buf) - 1) = '\0';
int stat_ret = stat(buf, &info);
if (stat_ret != 0) {
LOG_ERRORF("walk.c", "Could not stat file %s (%s)", buf, strerror(errno));
continue;
}
if (!S_ISREG(info.st_mode)) {
LOG_ERRORF("walk.c", "Is not a regular file: %s", buf);
continue;
}
char *absolute_path = canonicalize_file_name(buf);
if (absolute_path == NULL) {
LOG_FATALF("walk.c", "FIXME: Could not get absolute path of %s", buf);
}
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
if (S_ISREG(info.st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
}
continue;
}
if (!STR_STARTS_WITH(absolute_path, ScanCtx.index.desc.root)) {
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
}
int base = (int) (strrchr(buf, '/') - buf) + 1;
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
free(absolute_path);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
}
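
The switch from FTW_DEPTH to FTW_ACTIONRETVAL above is what lets the walker prune excluded directories instead of merely skipping their files. A minimal sketch of that pattern, assuming a hypothetical is_excluded() stand-in for the pcre-based EXCLUDED() macro:

```c
#define _GNU_SOURCE /* FTW_ACTIONRETVAL, FTW_SKIP_SUBTREE and FTW_CONTINUE are GNU extensions */
#include <ftw.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

/* Hypothetical exclude check standing in for the pcre-based EXCLUDED() macro. */
static int is_excluded(const char *path) {
    return strstr(path, "/node_modules") != NULL;
}

static int handle_entry(const char *path, const struct stat *info, int typeflag, struct FTW *ftw) {
    (void) info;
    (void) ftw;

    if (is_excluded(path)) {
        // Returning FTW_SKIP_SUBTREE for a directory prunes the whole subtree,
        // so excluded folders are never descended into.
        return typeflag == FTW_D ? FTW_SKIP_SUBTREE : FTW_CONTINUE;
    }
    if (typeflag == FTW_F) {
        printf("would scan: %s\n", path);
    }
    return FTW_CONTINUE;
}

int main(int argc, char *argv[]) {
    const char *root = argc > 1 ? argv[1] : ".";
    // FTW_ACTIONRETVAL makes nftw() honor the FTW_CONTINUE / FTW_SKIP_SUBTREE return values.
    return nftw(root, handle_entry, 64, FTW_PHYS | FTW_ACTIONRETVAL) == -1 ? 1 : 0;
}
```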


@@ -5,4 +5,6 @@
int walk_directory_tree(const char *);
int iterate_file_list(void* input_file);
#endif


@@ -55,10 +55,14 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
log_len += 1;
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno))
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}
void sist_logf(const char *filepath, int level, char *format, ...) {
@@ -104,8 +108,12 @@ void sist_log(const char *filepath, int level, char *str) {
);
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}
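
The memmove/'\n' dance above pairs with the new PrintingProgressBar flag in util.c: the progress bar redraws itself with '\r' and no newline, so the next log line has to move to a fresh row first. A stripped-down sketch of that handshake, with illustrative messages and buffer sizes:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int PrintingProgressBar = 0;

static void progress_bar_print(int percent) {
    // '\r' redraws the bar in place; no trailing newline is written.
    fprintf(stderr, "\r%3d%%[====>      ]", percent);
    PrintingProgressBar = 1;
}

static void sist_log(const char *msg) {
    char log_str[256];
    size_t log_len = (size_t) snprintf(log_str, sizeof(log_str), "%s\n", msg);

    if (PrintingProgressBar) {
        // A bar is currently on screen: shift the buffer right and prepend '\n'
        // so the log line starts on its own row (same trick as log.c above).
        PrintingProgressBar = 0;
        memmove(log_str + 1, log_str, log_len);
        log_str[0] = '\n';
        log_len += 1;
    }
    ssize_t ignored = write(STDERR_FILENO, log_str, log_len);
    (void) ignored;
}

int main(void) {
    progress_bar_print(42);
    sist_log("indexed 1000 documents"); // printed below the bar, not glued to it
    progress_bar_print(43);
    fprintf(stderr, "\n");
    return 0;
}
```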


@@ -14,6 +14,9 @@
#include "parsing/mime.h"
#include "parsing/parse.h"
#include <signal.h>
#include <unistd.h>
#include "stats.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
@@ -29,8 +32,6 @@ static const char *const usage[] = {
NULL,
};
#include<signal.h>
#include<unistd.h>
static __sighandler_t sigsegv_handler = NULL;
static __sighandler_t sigabrt_handler = NULL;
@@ -169,6 +170,7 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
ScanCtx.calculate_checksums = args->calculate_checksums;
@@ -218,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
if (args->ocr_images) {
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
}
init_media();
// OOXML
@@ -334,10 +341,20 @@ void sist2_scan(scan_args_t *args) {
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
tpool_start(ScanCtx.writer_pool);
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
if (args->list_path) {
// Scan using file list
int list_ret = iterate_file_list(args->list_file);
if (list_ret != 0) {
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
}
} else {
// Scan directory recursively
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
}
}
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
@@ -433,7 +450,7 @@ void sist2_index(index_args_t *args) {
cleanup = elastic_cleanup;
}
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, FALSE);
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, args->print == 0);
tpool_start(IndexCtx.pool);
struct dirent *de;
@@ -489,7 +506,7 @@ void sist2_web(web_args_t *args) {
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
WebCtx.tagline = args->tagline;
WebCtx.dev = args->dev;
strcpy(WebCtx.lang, "en");
strcpy(WebCtx.lang, args->lang);
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
@@ -518,8 +535,8 @@ void sist2_web(web_args_t *args) {
int main(int argc, const char *argv[]) {
// sigsegv_handler = signal(SIGSEGV, sig_handler);
// sigabrt_handler = signal(SIGABRT, sig_handler);
sigsegv_handler = signal(SIGSEGV, sig_handler);
sigabrt_handler = signal(SIGABRT, sig_handler);
setlocale(LC_ALL, "");
@@ -564,8 +581,11 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
"Passphrase for encrypted archive files"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
"Tesseract language (use 'tesseract --list-langs' to see "
"which are installed on your machine)"),
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
@@ -577,6 +597,9 @@ int main(int argc, const char *argv[]) {
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
" instead of normal directory traversal. Use '-' to read"
" from stdin."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
@@ -599,6 +622,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
OPT_STRING(0, "lang", &web_args->lang, "Default UI language. Can be changed by the user"),
OPT_GROUP("Exec-script options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),


@@ -79,7 +79,9 @@ void parse(void *arg) {
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
pthread_mutex_lock(&ScanCtx.copy_table_mu);
incremental_mark_file_for_copy(ScanCtx.copy_table, doc->path_md5);
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_skipped_files_count += 1;


@@ -1,6 +1,8 @@
#ifndef SIST_H
#define SIST_H
#define _GNU_SOURCE
#ifndef FALSE
#define FALSE (0)
#define BOOL int
@@ -51,7 +53,7 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "2.11.3"
#define VERSION "2.11.6"
static const char *const Version = VERSION;
#ifndef SIST_PLATFORM


@@ -177,7 +177,7 @@ static void *tpool_worker(void *arg) {
}
void tpool_wait(tpool_t *pool) {
LOG_INFO("tpool.c", "Waiting for worker threads to finish")
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish")
pthread_mutex_lock(&(pool->work_mutex));
while (TRUE) {
if (pool->done_cnt < pool->work_cnt) {
@@ -191,7 +191,9 @@ void tpool_wait(tpool_t *pool) {
}
}
}
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
if (pool->print_progress) {
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
}
pthread_mutex_unlock(&(pool->work_mutex));
LOG_INFO("tpool.c", "Worker threads finished")


@@ -84,11 +84,13 @@ char *expandpath(const char *path) {
return expanded;
}
int PrintingProgressBar = 0;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
static int last_val = -1;
int val = (int) (percentage * 100);
if (last_val == val || val > 100 || index_size < 1024) {
if (last_val == val || val > 100) {
return;
}
last_val = val;
@@ -114,13 +116,21 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
index_unit = 'M';
}
printf(
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
fflush(stdout);
if (tn_size == 0 && index_size == 0) {
fprintf(stderr,
"\r%3d%%[%.*s>%*s]",
val, lpad, PBSTR, rpad, ""
);
} else {
fprintf(stderr,
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
}
PrintingProgressBar = TRUE;
}
GHashTable *incremental_get_table() {


@@ -19,6 +19,8 @@ char *expandpath(const char *path);
dyn_buffer_t url_escape(char *str);
extern int PrintingProgressBar;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();
@@ -131,6 +133,9 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
}
}
/**
* Not thread safe!
*/
__always_inline
static int incremental_mark_file_for_copy(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
char *ptr = malloc(MD5_STR_LENGTH);


@@ -252,15 +252,34 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
mg_http_serve_file(nc, hm, full_path, mime, disposition);
}
void cache_es_version() {
static int is_cached = FALSE;
if (is_cached == TRUE) {
return;
}
es_version_t *es_version = elastic_get_version(WebCtx.es_url);
if (es_version != NULL) {
WebCtx.es_version = es_version;
is_cached = TRUE;
}
}
void index_info(struct mg_connection *nc) {
cache_es_version();
cJSON *json = cJSON_CreateObject();
cJSON *arr = cJSON_AddArrayToObject(json, "indices");
cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
cJSON_AddStringToObject(json, "version", Version);
cJSON_AddStringToObject(json, "esVersion", format_es_version(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionLegacy", USE_LEGACY_ES_SETTINGS(WebCtx.es_version));
cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
cJSON_AddStringToObject(json, "libscanHash", LibScanCommitHash);
cJSON_AddStringToObject(json, "lang", WebCtx.lang);
cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
#ifdef SIST_DEBUG

File diff suppressed because one or more lines are too long

third-party/libscan vendored

Submodule third-party/libscan deleted from 3787475ecb

third-party/libscan/.gitignore vendored Normal file

@@ -0,0 +1,12 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt
scan_test
third-party/ext_*
libscan-test-files
scan_*_test

third-party/libscan/CMakeLists.txt vendored Normal file

@@ -0,0 +1,233 @@
cmake_minimum_required(VERSION 3.15)
project(scan)
set(CMAKE_C_STANDARD 11)
option(BUILD_TESTS "Build tests" on)
add_subdirectory(third-party/antiword)
add_compile_definitions(
antiword
NDEBUG
)
add_library(
scan
libscan/util.c libscan/util.h
libscan/scan.h
libscan/macros.h
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED)
find_package(Threads REQUIRED)
find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
find_package(unofficial-brotli CONFIG REQUIRED)
find_library(LZO2_LIB NAMES lzo2)
find_library(RAW_LIB NAMES libraw.a)
find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
target_compile_options(
scan
PRIVATE
-g
)
include(ExternalProject)
find_program(MAKE_EXE NAMES gmake nmake make)
ExternalProject_Add(
libmobi
GIT_REPOSITORY https://github.com/simon987/libmobi.git
GIT_TAG "public"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./autogen.sh && ./configure
INSTALL_COMMAND ""
PREFIX "third-party/ext_libmobi"
SOURCE_DIR "third-party/ext_libmobi/src/libmobi"
BINARY_DIR "third-party/ext_libmobi/src/libmobi"
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent
)
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
ExternalProject_Add(
ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "n4.4"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BUILD_COMMAND ${MAKE_EXE} -j33 --silent
)
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
add_dependencies(
scan
libmobi
ffmpeg
antiword
libwpd
)
target_link_libraries(
scan
PUBLIC
cjson
${LibArchive_LIBRARIES}
ZLIB::ZLIB
BZip2::BZip2
lz4::lz4
${LZO2_LIB}
LibLZMA::LibLZMA
${MUPDF_LIB}
openjp2
${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB}
${HARFBUZZ_LIB}
${JBIG2DEC_LIB}
stdc++
-Wl,--whole-archive
m
-Wl,--no-whole-archive
${JPEG_LIBRARIES}
${Tesseract_LIBRARIES}
${LIBXML2_LIBRARIES}
${FREETYPE_LIB}
unofficial::brotli::brotlidec-static
${FFMPEG_LIB_DIR}/libavformat/libavformat.a
${FFMPEG_LIB_DIR}/libavcodec/libavcodec.a
${FFMPEG_LIB_DIR}/libavutil/libavutil.a
${FFMPEG_LIB_DIR}/libswresample/libswresample.a
${FFMPEG_LIB_DIR}/libswscale/libswscale.a
z
${CMAKE_THREAD_LIBS_INIT}
${RAW_LIB}
${GOMP_LIB}
${CMS_LIB}
${JAS_LIB}
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
target_include_directories(
scan
PUBLIC
${MUPDF_INC_DIR}
${JPEG_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
)
if (BUILD_TESTS)
find_package(GTest CONFIG REQUIRED)
add_executable(scan_ub_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_ub_test PRIVATE -g -fsanitize=undefined -fno-omit-frame-pointer)
target_link_libraries(scan_ub_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=undefined scan)
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
endif()

third-party/libscan/README.md vendored Normal file

@@ -0,0 +1,4 @@
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
```

third-party/libscan/libscan/arc/arc.c vendored Normal file

@@ -0,0 +1,244 @@
#include "arc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
int should_parse_filtered_file(const char *filepath, int ext) {
char tmp[PATH_MAX * 2];
if (ext == 0) {
return FALSE;
}
if (strncmp(filepath + ext, "tgz", 3) == 0) {
return TRUE;
}
memcpy(tmp, filepath, ext - 1);
*(tmp + ext - 1) = '\0';
char *idx = strrchr(tmp, '.');
if (idx == NULL) {
return FALSE;
}
if (strcmp(idx, ".tar") == 0) {
return TRUE;
}
return FALSE;
}
void arc_close(struct vfile *f) {
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
if (f->rewind_buffer != NULL) {
free(f->rewind_buffer);
f->rewind_buffer = NULL;
f->rewind_buffer_size = 0;
f->rewind_buffer_cursor = 0;
}
}
int arc_read(struct vfile *f, void *buf, size_t size) {
int bytes_copied = 0;
if (f->rewind_buffer_size != 0) {
if (size > f->rewind_buffer_size) {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);
bytes_copied = f->rewind_buffer_size;
size -= f->rewind_buffer_size;
buf += f->rewind_buffer_size;
f->rewind_buffer_size = 0;
} else {
memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
f->rewind_buffer_size -= (int) size;
f->rewind_buffer_cursor += (int) size;
return (int) size;
}
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != 0 && bytes_read <= size && f->calculate_checksum) {
f->has_checksum = TRUE;
safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
}
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
return (int) bytes_read + bytes_copied;
}
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
if (f->rewind_buffer != NULL) {
fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
exit(-1);
}
size_t bytes_read = archive_read_data(f->arc, buf, size);
if (bytes_read != size && archive_errno(f->arc) != 0) {
const char *error_str = archive_error_string(f->arc);
if (error_str != NULL) {
f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
}
return -1;
}
f->rewind_buffer = malloc(size);
f->rewind_buffer_size = (int) size;
f->rewind_buffer_cursor = 0;
memcpy(f->rewind_buffer, buf, size);
return (int) bytes_read;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
arc_data->f = f;
if (f->is_fs_file) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
if (ctx->passphrase[0] != 0) {
archive_read_add_passphrase(*a, ctx->passphrase);
}
return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
} else if (allow_recurse) {
*a = archive_read_new();
archive_read_support_filter_all(*a);
archive_read_support_format_all(*a);
if (ctx->passphrase[0] != 0) {
archive_read_add_passphrase(*a, ctx->passphrase);
}
return archive_read_open(
*a, arc_data,
vfile_open_callback,
vfile_read_callback,
vfile_close_callback
);
} else {
return ARC_SKIPPED;
}
}
static __thread int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
arc_data.f = f;
int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
if (ret == ARC_SKIPPED) {
return SCAN_OK;
}
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return SCAN_ERR_READ;
}
if (ctx->mode == ARC_MODE_LIST) {
dyn_buffer_t buf = dyn_buffer_create();
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
dyn_buffer_append_string(&buf, file_path);
dyn_buffer_write_char(&buf, ' ');
}
}
dyn_buffer_write_char(&buf, '\0');
meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
meta_list->key = MetaContent;
strcpy(meta_list->str_val, buf.buf);
APPEND_META(doc, meta_list)
dyn_buffer_destroy(&buf);
} else {
parse_job_t *sub_job = malloc(sizeof(parse_job_t) + PATH_MAX * 2);
sub_job->vfile.close = arc_close;
sub_job->vfile.read = arc_read;
sub_job->vfile.read_rewindable = arc_read_rewindable;
sub_job->vfile.reset = NULL;
sub_job->vfile.arc = a;
sub_job->vfile.filepath = sub_job->filepath;
sub_job->vfile.is_fs_file = FALSE;
sub_job->vfile.rewind_buffer_size = 0;
sub_job->vfile.rewind_buffer = NULL;
sub_job->vfile.log = ctx->log;
sub_job->vfile.logf = ctx->logf;
sub_job->vfile.has_checksum = FALSE;
sub_job->vfile.calculate_checksum = f->calculate_checksum;
memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
sub_job->vfile.info = *archive_entry_stat(entry);
if (S_ISREG(sub_job->vfile.info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
if (utf8_name == NULL) {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, archive_entry_pathname(entry));
} else {
sprintf(sub_job->filepath, "%s#/%s", f->filepath, utf8_name);
}
sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;
// Handle excludes
if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
continue;
}
char *p = strrchr(sub_job->filepath, '.');
if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
sub_job->ext = (int) (p - sub_job->filepath + 1);
} else {
sub_job->ext = (int) strlen(sub_job->filepath);
}
SHA1_Init(&sub_job->vfile.sha1_ctx);
ctx->parse(sub_job);
}
}
free(sub_job);
}
archive_read_free(a);
return SCAN_OK;
}

third-party/libscan/libscan/arc/arc.h vendored Normal file

@@ -0,0 +1,80 @@
#ifndef SCAN_ARC_H
#define SCAN_ARC_H
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
# define ARC_SKIPPED (-1)
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
typedef struct {
archive_mode_t mode;
parse_callback_t parse;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
char passphrase[4096];
} scan_arc_ctx_t;
#define ARC_BUF_SIZE 8192
typedef struct {
vfile_t *f;
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
int arc_read(struct vfile *f, void *buf, size_t size);
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif
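
arc.h builds on libarchive's callback-driven reader: archive_read_open() is given opaque user data plus open/read/close callbacks, which is how sist2 streams nested archive members through a vfile_t instead of a path on disk. A minimal standalone illustration of that pattern (the FILE*-backed source below is purely for demonstration; the real vfile callbacks also feed the SHA1 checksum):

```c
#include <archive.h>
#include <archive_entry.h>
#include <stdio.h>

#define BUF_SIZE 8192

typedef struct {
    FILE *fp;
    char buf[BUF_SIZE];
} my_source_t;

static la_ssize_t my_read_cb(struct archive *a, void *user_data, const void **buf) {
    (void) a;
    my_source_t *src = user_data;
    *buf = src->buf;
    // Return the number of bytes placed in the buffer; 0 signals EOF.
    return (la_ssize_t) fread(src->buf, 1, sizeof(src->buf), src->fp);
}

static int my_close_cb(struct archive *a, void *user_data) {
    (void) a;
    fclose(((my_source_t *) user_data)->fp);
    return ARCHIVE_OK;
}

int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <archive>\n", argv[0]);
        return 1;
    }
    my_source_t src = {.fp = fopen(argv[1], "rb")};
    if (src.fp == NULL) {
        return 1;
    }

    struct archive *a = archive_read_new();
    archive_read_support_filter_all(a);
    archive_read_support_format_all(a);

    // Same shape as arc_open(): libarchive pulls bytes through our callbacks
    // instead of opening the file itself.
    if (archive_read_open(a, &src, NULL, my_read_cb, my_close_cb) != ARCHIVE_OK) {
        fprintf(stderr, "%s\n", archive_error_string(a));
        archive_read_free(a);
        return 1;
    }

    struct archive_entry *entry;
    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        printf("%s\n", archive_entry_pathname(entry));
        archive_read_data_skip(a);
    }
    archive_read_free(a);
    return 0;
}
```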


@@ -0,0 +1,58 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat info = *archive_entry_stat(entry);
if (S_ISREG(info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.');
if (p != NULL && (strcmp(p, ".png") == 0 || strcmp(p, ".jpg") == 0 || strcmp(p, ".jpeg") == 0)) {
size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size);
size_t read = archive_read_data(a, buf, entry_size);
if (read != entry_size) {
const char *err_str = archive_error_string(a);
if (err_str) {
CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str)
}
free(buf);
break;
}
ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path);
free(buf);
if (ret == TRUE) {
break;
}
}
}
}
archive_read_free(a);
}


@@ -0,0 +1,31 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
unsigned int cbr_mime;
unsigned int cbz_mime;
} scan_comic_ctx_t;
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
return mime == ctx->cbr_mime;
}
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
return mime == ctx->cbz_mime;
}
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif


@@ -0,0 +1,478 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
#include "../ocr/ocr.h"
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
pthread_mutex_t Mutex;
static void my_fz_lock(UNUSED(void *user), int lock) {
if (lock == FZ_LOCK_FREETYPE) {
pthread_mutex_lock(&Mutex);
}
}
static void my_fz_unlock(UNUSED(void *user), int lock) {
if (lock == FZ_LOCK_FREETYPE) {
pthread_mutex_unlock(&Mutex);
}
}
int pixmap_is_blank(const fz_pixmap *pixmap) {
int pixmap_size = pixmap->n * pixmap->w * pixmap->h;
const int pixel0 = pixmap->samples[0];
for (int i = 0; i < pixmap_size; i++) {
if (pixmap->samples[i] != pixel0) {
return FALSE;
}
}
return TRUE;
}
fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0;
fz_var(cover);
fz_var(err);
fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
fz_catch(fzctx)err = 1;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
return NULL;
}
fz_rect bounds = fz_bound_page(fzctx, *cover);
float scale;
float w = bounds.x1 - bounds.x0;
float h = bounds.y1 - bounds.y0;
if (w > h) {
scale = (float) ctx->tn_size / w;
} else {
scale = (float) ctx->tn_size / h;
}
fz_matrix m = fz_scale(scale, scale);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fz_device_rgb(fzctx), bbox, NULL, 0);
fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
fz_var(err);
fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
if (pixmap->n != 3) {
CTX_LOG_ERRORF(doc->filepath, "Got unexpected pixmap depth: %d", pixmap->n)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
return pixmap;
}
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
fz_page *cover = NULL;
fz_pixmap *pixmap = load_pixmap(ctx, 0, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
if (pixmap_is_blank(pixmap)) {
fz_drop_page(fzctx, cover);
fz_drop_pixmap(fzctx, pixmap);
CTX_LOG_DEBUG(doc->filepath, "Cover page is blank, using page 1 instead")
pixmap = load_pixmap(ctx, 1, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
}
// RGB24 -> YUV420p
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
pixmap->w, pixmap->h, AV_PIX_FMT_RGB24,
pixmap->w, pixmap->h, AV_PIX_FMT_YUV420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
1);
unsigned char *samples = calloc(1, 1024 * 1024 * 1024);
memcpy(samples, pixmap->samples, pixmap->stride * pixmap->h);
const uint8_t *in_data[1] = {samples,};
int in_line_size[1] = {(int) pixmap->stride};
sws_scale(sws_ctx,
in_data, in_line_size,
0, pixmap->h,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = pixmap->w;
scaled_frame->height = pixmap->h;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
// YUV420p -> JPEG
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, pixmap->w, pixmap->h)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
free(samples);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
fz_drop_pixmap(fzctx, pixmap);
fz_drop_page(fzctx, cover);
return TRUE;
}
void fz_err_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message)
}
void fz_warn_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message)
}
static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx);
static int mu_is_initialized = FALSE;
if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = TRUE;
}
fzctx->warn.print_user = doc;
fzctx->warn.print = fz_warn_callback;
fzctx->error.print_user = doc;
fzctx->error.print = fz_err_callback;
fzctx->locks.lock = my_fz_lock;
fzctx->locks.unlock = my_fz_unlock;
}
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
if (block->type != FZ_STEXT_BLOCK_TEXT) {
return 0;
}
fz_stext_line *line = block->u.t.first_line;
while (line != NULL) {
text_buffer_append_char(tex, ' ');
fz_stext_char *c = line->first_char;
while (c != NULL) {
if (text_buffer_append_char(tex, c->c) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
c = c->next;
}
line = line->next;
}
text_buffer_append_char(tex, ' ');
return 0;
}
static void fill_image_ocr_cb(const char* text, size_t len) {
text_buffer_append_string(&thread_buffer, text, len - 1);
}
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
UNUSED(fz_color_params color_params)) {
int l2factor = 0;
if (img->w >= MIN_OCR_WIDTH && img->h >= MIN_OCR_HEIGHT && OCR_IS_VALID_BPP(img->n)) {
fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);
ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);
fz_drop_pixmap(fzctx, pix);
}
}
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
thread_ctx = *ctx;
init_fzctx(fzctx, doc);
int err = 0;
fz_document *fzdoc = NULL;
fz_stream *stream = NULL;
fz_var(fzdoc);
fz_var(stream);
fz_var(err);
fz_try(fzctx) {
stream = fz_open_memory(fzctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
int page_count = -1;
fz_var(err);
fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
APPEND_LONG_META(doc, MetaPages, page_count)
if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
}
if (tn_only) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
char title[8192] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_catch(fzctx);
if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
char author[4096] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_catch(fzctx);
if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
if (ctx->content_size > 0) {
fz_stext_options opts = {0};
thread_buffer = text_buffer_create(ctx->content_size);
for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL;
fz_var(err);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_block *block = stext->first_block;
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
break;
}
}
text_buffer_terminate_string(&thread_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&thread_buffer);
}
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
}
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
struct archive *a = NULL;
struct archive_entry *entry = NULL;
arc_data_t arc_data;
text_buffer_t content_buffer = text_buffer_create(ctx->content_size);
if (ctx->tn_size <= 0) {
return;
}
int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
archive_read_free(a);
return;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
struct stat info = *archive_entry_stat(entry);
if (S_ISREG(info.st_mode)) {
const char *utf8_name = archive_entry_pathname_utf8(entry);
const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
char *p = strrchr(file_path, '.');
if (p != NULL && (strcmp(p, ".html") == 0 || (strcmp(p, ".xhtml") == 0))) {
size_t entry_size = archive_entry_size(entry);
void *buf = malloc(entry_size + 1);
size_t read = archive_read_data(a, buf, entry_size);
*(char *) (buf + entry_size) = '\0';
if (read != entry_size) {
const char *err_str = archive_error_string(a);
if (err_str) {
CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
}
free(buf);
break;
}
ret = text_buffer_append_markup(&content_buffer, buf);
free(buf);
if (ret == TEXT_BUF_FULL) {
break;
}
}
}
}
text_buffer_terminate_string(&content_buffer);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&content_buffer);
archive_read_free(a);
}
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
if (ctx->fast_epub_parse && is_epub(mime_str)) {
parse_epub_fast(ctx, f, doc);
return;
}
size_t buf_len;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE);
free(buf);
}


@@ -0,0 +1,30 @@
#ifndef SCAN_EBOOK_H
#define SCAN_EBOOK_H
#include "../scan.h"
typedef struct {
long content_size;
int tn_size;
const char *tesseract_lang;
const char *tesseract_path;
pthread_mutex_t mupdf_mutex;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int fast_epub_parse;
float tn_qscale;
} scan_ebook_ctx_t;
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
__always_inline
static int is_epub(const char *mime_string) {
return strcmp(mime_string, "application/epub+zip") == 0;
}
#endif

third-party/libscan/libscan/font/font.c vendored Normal file

@@ -0,0 +1,246 @@
#include "font.h"
#include <ft2build.h>
#include <freetype/freetype.h>
#include "../util.h"
__thread FT_Library ft_lib = NULL;
typedef struct text_dimensions {
unsigned int width;
unsigned int height;
unsigned int baseline;
} text_dimensions_t;
typedef struct glyph {
int top;
int height;
int width;
int descent;
int ascent;
int advance_width;
unsigned char *pixmap;
} glyph_t;
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
FT_Vector kerning;
FT_Get_Kerning(face, c, pc, FT_KERNING_DEFAULT, &kerning);
return (int) (kerning.x / 64);
}
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
glyph_t glyph;
glyph.pixmap = slot->bitmap.buffer;
glyph.width = (int) slot->bitmap.width;
glyph.height = (int) slot->bitmap.rows;
glyph.top = slot->bitmap_top;
glyph.advance_width = (int) slot->advance.x / 64;
glyph.descent = MAX(0, glyph.height - glyph.top);
glyph.ascent = MAX(0, MAX(glyph.top, glyph.height) - glyph.descent);
return glyph;
}
text_dimensions_t text_dimension(char *text, FT_Face face) {
text_dimensions_t dimensions;
dimensions.width = 0;
int num_chars = (int) strlen(text);
unsigned int max_ascent = 0;
int max_descent = 0;
char pc = 0;
for (int i = 0; i < num_chars; i++) {
char c = text[i];
FT_Load_Char(face, c, 0);
glyph_t glyph = ft_glyph_to_glyph(face->glyph);
max_descent = MAX(max_descent, glyph.descent);
max_ascent = MAX(max_ascent, MAX(glyph.height, glyph.ascent));
int kerning_x = kerning_offset(c, pc, face);
dimensions.width += MAX(glyph.advance_width, glyph.width) + kerning_x;
pc = c;
}
dimensions.height = max_ascent + max_descent;
dimensions.baseline = max_descent;
return dimensions;
}
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
unsigned int src = 0;
unsigned int dst = y * text_info.width + x;
unsigned int row_offset = text_info.width - glyph->width;
unsigned int buf_len = text_info.width * text_info.height;
for (unsigned int sy = 0; sy < glyph->height; sy++) {
for (unsigned int sx = 0; sx < glyph->width; sx++) {
if (dst < buf_len) {
bitmap[dst] |= glyph->pixmap[src];
}
src++;
dst++;
}
dst += row_offset;
}
}
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {
dyn_buffer_write_short(buf, 0x4D42); // Magic
dyn_buffer_write_int(buf, 0); // Size placeholder
dyn_buffer_write_int(buf, 0x5157); //Reserved
dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset
dyn_buffer_write_int(buf, 40); // DIB size
dyn_buffer_write_int(buf, (int) dimensions.width);
dyn_buffer_write_int(buf, (int) dimensions.height);
dyn_buffer_write_short(buf, 1); // Color planes
dyn_buffer_write_short(buf, 8); // bits per pixel
dyn_buffer_write_int(buf, 0); // compression
dyn_buffer_write_int(buf, 0); // Ignored
dyn_buffer_write_int(buf, 3800); // hres
dyn_buffer_write_int(buf, 3800); // vres
dyn_buffer_write_int(buf, 256); // Color count
dyn_buffer_write_int(buf, 0); // Ignored
// RGBA32 Color table (Grayscale)
for (int i = 255; i >= 0; i--) {
dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
}
// Pixel array: write from bottom to top, with rows padded to multiples of 4-bytes
for (int y = (int) dimensions.height - 1; y >= 0; y--) {
for (unsigned int x = 0; x < dimensions.width; x++) {
dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
}
while (buf->cur % 4 != 0) {
dyn_buffer_write_char(buf, 0);
}
}
// Size
*(int *) ((char *) buf->buf + 2) = buf->cur;
}
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (ft_lib == NULL) {
FT_Init_FreeType(&ft_lib);
}
size_t buf_len = 0;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FT_Face face;
FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
if (err != 0) {
CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
FT_Error_String(err))
free(buf);
return;
}
char font_name[4096];
if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
if (face->family_name == NULL) {
strcpy(font_name, "(null)");
} else {
strncpy(font_name, face->family_name, sizeof(font_name));
}
} else {
snprintf(font_name, sizeof(font_name), "%s %s", face->family_name, face->style_name);
}
meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name));
meta_name->key = MetaFontName;
strcpy(meta_name->str_val, font_name);
APPEND_META(doc, meta_name)
// Thumbnails disabled: keep only the font name metadata
if (ctx->enable_tn == FALSE) {
FT_Done_Face(face);
free(buf);
return;
}
int pixel = 64;
int num_chars = (int) strlen(font_name);
err = FT_Set_Pixel_Sizes(face, 0, pixel);
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
FT_Error_String(err))
FT_Done_Face(face);
free(buf);
return;
}
text_dimensions_t dimensions = text_dimension(font_name, face);
unsigned char *bitmap = calloc(dimensions.width * dimensions.height, 1);
FT_Vector pen;
pen.x = 0;
char pc = 0;
for (int i = 0; i < num_chars; i++) {
char c = font_name[i];
err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
if (err != 0) {
c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
FT_Error_String(err))
continue;
}
}
glyph_t glyph = ft_glyph_to_glyph(face->glyph);
pen.x += kerning_offset(c, pc, face);
if (pen.x <= 0) {
pen.x = ABS(glyph.advance_width - glyph.width);
}
pen.y = dimensions.height - glyph.ascent - dimensions.baseline;
draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);
pen.x += glyph.advance_width;
pc = c;
}
dyn_buffer_t bmp_data = dyn_buffer_create();
bmp_format(&bmp_data, dimensions, bitmap);
APPEND_TN_META(doc, dimensions.width, dimensions.height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);
dyn_buffer_destroy(&bmp_data);
free(bitmap);
FT_Done_Face(face);
free(buf);
}
void cleanup_font() {
FT_Done_FreeType(ft_lib);
}

17
third-party/libscan/libscan/font/font.h vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef SCAN_FONT_H
#define SCAN_FONT_H
#include "../scan.h"
typedef struct {
int enable_tn;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_font_ctx_t;
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc);
void cleanup_font();
#endif

119
third-party/libscan/libscan/json/json.c vendored Normal file
View File

@@ -0,0 +1,119 @@
#include "json.h"
#include "cjson/cJSON.h"
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
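// Recursively collect every string value in the JSON tree into the text buffer; returns TRUE once the buffer is full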
int json_extract_text(cJSON *json, text_buffer_t *tex) {
if (cJSON_IsObject(json)) {
for (cJSON *child = json->child; child != NULL; child = child->next) {
if (json_extract_text(child, tex)) {
return TRUE;
}
}
} else if (cJSON_IsArray(json)) {
cJSON *child;
cJSON_ArrayForEach(child, json) {
if (json_extract_text(child, tex)) {
return TRUE;
}
}
} else if (cJSON_IsString(json)) {
if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
return TRUE;
}
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
return TRUE;
}
}
return FALSE;
}
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
if (f->info.st_size > JSON_MAX_FILE_SIZE) {
CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
return SCAN_ERR_SKIP;
}
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
return SCAN_ERR_READ;
}
buf_len += 1;
buf = realloc(buf, buf_len);
*(buf + buf_len - 1) = '\0';
cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
text_buffer_t tex = text_buffer_create(ctx->content_size);
json_extract_text(json, &tex);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
cJSON_Delete(json);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}
#define JSON_BUF_SIZE (1024 * 1024 * 5)
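// NDJSON is parsed in a streaming fashion: refill the fixed-size buffer, parse one object at a time, then move the unconsumed tail back to the start of the buffer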
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
*(buf + JSON_BUF_SIZE) = '\0';
text_buffer_t tex = text_buffer_create(ctx->content_size);
size_t ret;
int eof = FALSE;
const char *parse_end = buf;
size_t to_read;
char *ptr = buf;
while (TRUE) {
cJSON *json;
if (!eof) {
to_read = parse_end == buf ? JSON_BUF_SIZE : parse_end - buf;
ret = f->read(f, ptr, to_read);
if (ret != to_read) {
eof = TRUE;
}
}
json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);
if (parse_end == buf + JSON_BUF_SIZE) {
CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
cJSON_Delete(json);
break;
}
if (parse_end == buf) {
cJSON_Delete(json);
break;
}
json_extract_text(json, &tex);
cJSON_Delete(json);
memmove(buf, parse_end, (buf + JSON_BUF_SIZE - parse_end));
ptr = buf + JSON_BUF_SIZE - parse_end + buf;
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}

30
third-party/libscan/libscan/json/json.h vendored Normal file
View File

@@ -0,0 +1,30 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
unsigned int json_mime;
unsigned int ndjson_mime;
} scan_json_ctx_t;
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
return mime == ctx->json_mime;
}
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
return mime == ctx->ndjson_mime;
}
#endif

62
third-party/libscan/libscan/macros.h vendored Normal file
View File

@@ -0,0 +1,62 @@
#ifndef FALSE
#define FALSE (0)
#define BOOL int
#endif
#ifndef TRUE
#define TRUE (!FALSE)
#endif
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \
strcpy(meta_str->str_val, value); \
APPEND_META(doc, meta_str)}
#define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_long->key = keyname; \
meta_long->long_val = value; \
APPEND_META(doc, meta_long)}
#define APPEND_TN_META(doc, width, height) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \
meta_str->key = MetaThumbnail; \
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
APPEND_META(doc, meta_str)}
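// Append a meta_line_t to the document's singly-linked metadata list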
#define APPEND_META(doc, meta) \
meta->next = NULL;\
if (doc->meta_head == NULL) {\
doc->meta_head = meta;\
doc->meta_tail = doc->meta_head;\
} else {\
doc->meta_tail->next = meta;\
doc->meta_tail = meta;\
}
#define APPEND_UTF8_META(doc, keyname, str) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, str); \
text_buffer_terminate_string(&tex); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
APPEND_META(doc, meta_tag) \
text_buffer_destroy(&tex);

View File

@@ -0,0 +1,809 @@
#include "media.h"
#include "../ocr/ocr.h"
#include <ctype.h>
#define MIN_SIZE 32
#define AVIO_BUF_SIZE 8192
#define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)
#define STREAM_IS_IMAGE (stream->nb_frames <= 1)
#define STORE_AS_IS ((void*)-1)
// Pointer to document being processed
__thread document_t *thread_doc;
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
int has_extension = doc->ext > doc->base;
if (!has_extension) {
if (strcmp(mime_str, "image/png") == 0) {
return "file.png";
} else if (strcmp(mime_str, "image/jpeg") == 0) {
return "file.jpg";
}
}
return filepath;
}
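// Downscale a decoded frame to the thumbnail size while preserving aspect ratio; returns STORE_AS_IS when a small JPEG/PNG can be stored verbatim, or NULL when no thumbnail should be produced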
__always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
return NULL;
}
int dstW;
int dstH;
if (frame->width <= size && frame->height <= size) {
if (decoder->codec_id == AV_CODEC_ID_MJPEG || decoder->codec_id == AV_CODEC_ID_PNG) {
return STORE_AS_IS;
}
dstW = frame->width;
dstH = frame->height;
} else {
double ratio = (double) frame->width / frame->height;
if (frame->width > frame->height) {
dstW = size;
dstH = (int) (size / ratio);
} else {
dstW = (int) (size * ratio);
dstH = size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return NULL;
}
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
decoder->width, decoder->height, decoder->pix_fmt,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, decoder->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
return scaled_frame;
}
typedef struct {
AVPacket *packet;
AVFrame *frame;
} frame_and_packet_t;
static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) {
if (frame_and_packet->packet != NULL) {
av_packet_free(&frame_and_packet->packet);
}
if (frame_and_packet->frame != NULL) {
av_frame_free(&frame_and_packet->frame);
}
free(frame_and_packet->packet);
free(frame_and_packet);
}
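// Decode the subtitle stream and append the dialogue text (the part after the \N marker of each ASS rect) to the document content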
__always_inline
static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) {
text_buffer_t tex = text_buffer_create(-1);
AVPacket packet;
AVSubtitle subtitle;
AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
avcodec_open2(decoder, subtitle_codec, NULL);
decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
int got_sub;
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, &packet);
if (read_frame_ret != 0) {
break;
}
if (packet.stream_index != stream_idx) {
av_packet_unref(&packet);
continue;
}
avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet);
if (got_sub) {
for (int i = 0; i < subtitle.num_rects; i++) {
const char *text = subtitle.rects[i]->ass;
if (text == NULL) {
continue;
}
char *idx = strstr(text, "\\N");
if (idx != NULL && strlen(idx + 2) > 1) {
text_buffer_append_string0(&tex, idx + 2);
text_buffer_append_char(&tex, ' ');
}
}
avsubtitle_free(&subtitle);
}
av_packet_unref(&packet);
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
text_buffer_destroy(&tex);
avcodec_free_context(&decoder);
}
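// Read packets from the target stream and feed them to the decoder until a full frame is available; returns NULL on EOF or on a decoding error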
__always_inline
static frame_and_packet_t *
read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx,
document_t *doc) {
frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t));
result->packet = av_packet_alloc();
result->frame = av_frame_alloc();
av_init_packet(result->packet);
int receive_ret = -EAGAIN;
while (receive_ret == -EAGAIN) {
// Get video frame
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, result->packet);
if (read_frame_ret != 0) {
if (read_frame_ret != AVERROR_EOF) {
CTX_LOG_WARNINGF(doc->filepath,
"(media.c) avcodec_read_frame() returned error code [%d] %s",
read_frame_ret, av_err2str(read_frame_ret)
)
}
frame_and_packet_free(result);
return NULL;
}
//Ignore audio/other frames
if (result->packet->stream_index != stream_idx) {
av_packet_unref(result->packet);
continue;
}
break;
}
// Feed it to decoder
int decode_ret = avcodec_send_packet(decoder, result->packet);
if (decode_ret != 0) {
CTX_LOG_ERRORF(doc->filepath,
"(media.c) avcodec_send_packet() returned error code [%d] %s",
decode_ret, av_err2str(decode_ret)
)
frame_and_packet_free(result);
return NULL;
}
receive_ret = avcodec_receive_frame(decoder, result->frame);
if (receive_ret == -EAGAIN && result->packet != NULL) {
av_packet_unref(result->packet);
}
}
return result;
}
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {
meta_line_t *meta = doc->meta_head;
while (meta != NULL) {
if (meta->key == key) {
CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'",
key, meta->str_val, key, tag->value)
return;
}
meta = meta->next;
}
text_buffer_t tex = text_buffer_create(-1);
text_buffer_append_string0(&tex, tag->value);
text_buffer_terminate_string(&tex);
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta_tag->key = key;
strcpy(meta_tag->str_val, tex.dyn_buffer.buf);
APPEND_META(doc, meta_tag)
text_buffer_destroy(&tex);
}
#define APPEND_TAG_META(keyname) \
APPEND_UTF8_META(doc, keyname, tag->value)
#define STRCPY_TOLOWER(dst, str) \
strncpy(dst, str, sizeof(dst)); \
char *ptr = dst; \
for (; *ptr; ++ptr) *ptr = (char) tolower(*ptr);
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
AVDictionaryEntry *tag = NULL;
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
APPEND_TAG_META(MetaArtist)
} else if (strcmp(key, "genre") == 0) {
APPEND_TAG_META(MetaGenre)
} else if (strcmp(key, "title") == 0) {
APPEND_TAG_META(MetaTitle)
} else if (strcmp(key, "album_artist") == 0) {
APPEND_TAG_META(MetaAlbumArtist)
} else if (strcmp(key, "album") == 0) {
APPEND_TAG_META(MetaAlbum)
} else if (strcmp(key, "comment") == 0) {
APPEND_TAG_META(MetaContent)
}
}
}
__always_inline
static void
append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int is_video) {
if (is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
if (meta_duration->long_val > INT32_MAX) {
meta_duration->long_val = 0;
}
APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->long_val = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
}
AVDictionaryEntry *tag = NULL;
if (is_video) {
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "title") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaTitle);
} else if (strcmp(key, "comment") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
}
}
} else {
// EXIF metadata
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(key, "imagedescription") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake)
} else if (strcmp(key, "model") == 0) {
APPEND_TAG_META(MetaExifModel)
} else if (strcmp(key, "software") == 0) {
APPEND_TAG_META(MetaExifSoftware)
} else if (strcmp(key, "fnumber") == 0) {
APPEND_TAG_META(MetaExifFNumber)
} else if (strcmp(key, "focallength") == 0) {
APPEND_TAG_META(MetaExifFocalLength)
} else if (strcmp(key, "usercomment") == 0) {
APPEND_TAG_META(MetaExifUserComment)
} else if (strcmp(key, "isospeedratings") == 0) {
APPEND_TAG_META(MetaExifIsoSpeedRatings)
} else if (strcmp(key, "exposuretime") == 0) {
APPEND_TAG_META(MetaExifExposureTime)
} else if (strcmp(key, "datetime") == 0) {
APPEND_TAG_META(MetaExifDateTime)
} else if (strcmp(key, "gpslatitude") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeDMS)
} else if (strcmp(key, "gpslatituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeRef)
} else if (strcmp(key, "gpslongitude") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeDMS)
} else if (strcmp(key, "gpslongituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeRef)
}
}
}
}
static void ocr_image_cb(const char *text, size_t len) {
APPEND_STR_META(thread_doc, MetaContent, text);
}
#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32
#define OCR_BYTES_PER_PIXEL 4
#define OCR_PIXELS_PER_INCH 70
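// Convert the frame to RGB32 and run Tesseract on it; any recognized text is appended to the document through ocr_image_cb()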
void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {
// Convert to RGB32
AVFrame *rgb_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
frame->width, frame->height, decoder->pix_fmt,
frame->width, frame->height, OCR_PIXEL_FORMAT,
SWS_LANCZOS, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf, OCR_PIXEL_FORMAT, frame->width, frame->height,
1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, frame->height,
rgb_frame->data, rgb_frame->linesize
);
thread_doc = doc;
ocr_extract_text(
ctx->tesseract_path,
ctx->tesseract_lang,
rgb_frame->data[0],
frame->width,
frame->height,
OCR_BYTES_PER_PIXEL,
rgb_frame->linesize[0],
OCR_PIXELS_PER_INCH,
ocr_image_cb
);
sws_freeContext(sws_ctx);
av_free(*rgb_frame->data);
av_frame_free(&rgb_frame);
}
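// Pick an audio, video and subtitle stream, extract their metadata/tags, and optionally OCR the image and generate a thumbnail from the video stream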
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
int subtitle_stream = -1;
avformat_find_stream_info(pFormatCtx, NULL);
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
AVStream *stream = pFormatCtx->streams[i];
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
if (audio_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
}
audio_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
if (video_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaVideoCodec, desc->name)
}
meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth;
meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight;
meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h)
video_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
subtitle_stream = i;
}
}
if (subtitle_stream != -1 && ctx->read_subtitles) {
read_subtitles(ctx, pFormatCtx, subtitle_stream, doc);
// Reset stream
if (video_stream != -1) {
av_seek_frame(pFormatCtx, video_stream, 0, 0);
}
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream];
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
//Seek
if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
(long) ((double) stream->duration * 0.10), 0);
if (seek_ret == 0) {
break;
}
}
}
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, video_stream, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
}
// NOTE: OCR'd content takes precedence over exif image description
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (scaled_frame == STORE_AS_IS) {
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
avcodec_free_context(&jpeg_encoder);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
}
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
}
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return;
}
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
parse_media_format_ctx(ctx, pFormatCtx, doc);
}
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
struct vfile *f = ptr;
int ret = f->read(f, buf, buf_size);
if (ret == 0) {
return AVERROR_EOF;
}
return ret;
}
typedef struct {
size_t size;
FILE *file;
void *buf;
} memfile_t;
int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
memfile_t *mem = ptr;
size_t ret = fread(buf, 1, buf_size, mem->file);
if (ret == 0 && feof(mem->file)) {
return AVERROR_EOF;
}
return (int) ret;
}
long memfile_seek(void *ptr, long offset, int whence) {
memfile_t *mem = ptr;
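// whence == 0x10000 is AVSEEK_SIZE: FFmpeg asks for the total stream size instead of performing a seek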
if (whence == 0x10000) {
return (long) mem->size;
}
int ret = fseek(mem->file, offset, whence);
if (ret != 0) {
return AVERROR_EOF;
}
return ftell(mem->file);
}
int memfile_open(vfile_t *f, memfile_t *mem) {
mem->size = f->info.st_size;
mem->buf = malloc(mem->size);
if (mem->buf == NULL) {
return -1;
}
int ret = f->read(f, mem->buf, mem->size);
mem->file = fmemopen(mem->buf, mem->size, "rb");
if (f->calculate_checksum) {
SHA1_Init(&f->sha1_ctx);
safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
SHA1_Final(f->sha1_digest, &f->sha1_ctx);
f->has_checksum = TRUE;
}
return (ret == mem->size && mem->file != NULL) ? 0 : -1;
}
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
mem->size = (int) buf_len;
mem->buf = buf;
mem->file = fmemopen(mem->buf, mem->size, "rb");
return mem->file != NULL ? 0 : -1;
}
void memfile_close(memfile_t *mem) {
if (mem->buf != NULL) {
free(mem->buf);
fclose(mem->file);
}
}
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return;
}
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
AVIOContext *io_ctx = NULL;
memfile_t memfile = {0, 0, 0};
const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);
if (f->info.st_size <= ctx->max_media_buffer) {
int ret = memfile_open(f, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
}
}
if (io_ctx == NULL) {
CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
}
pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
if (res < 0) {
if (res != -5) {
CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
}
av_free(io_ctx->buffer);
memfile_close(&memfile);
avio_context_free(&io_ctx);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
parse_media_format_ctx(ctx, pFormatCtx, doc);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
memfile_close(&memfile);
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
if (f->is_fs_file) {
parse_media_filename(ctx, f->filepath, doc);
} else {
parse_media_vfile(ctx, f, doc, mime_str);
}
}
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
memfile_t memfile = {0, 0, 0};
AVIOContext *io_ctx = NULL;
AVFormatContext *pFormatCtx = avformat_alloc_context();
if (pFormatCtx == NULL) {
CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
return FALSE;
}
unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
int ret = memfile_open_buf(buf, buf_len, &memfile);
if (ret == 0) {
CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%ldB)", buf_len)
io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
} else {
// memfile_open_buf() failed: memfile.file is NULL, so only the AVIO buffer needs to be released
av_free(buffer);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return FALSE;
}
pFormatCtx->pb = io_ctx;
int res = avformat_open_input(&pFormatCtx, url, NULL, NULL);
if (res != 0) {
av_free(io_ctx->buffer);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
AVStream *stream = pFormatCtx->streams[0];
// Decoder
const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, 0, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return FALSE;
}
if (scaled_frame == STORE_AS_IS) {
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
avcodec_free_context(&jpeg_encoder);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
av_free(io_ctx->buffer);
avio_context_free(&io_ctx);
fclose(memfile.file);
return TRUE;
}

View File

@@ -0,0 +1,55 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "../scan.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavcodec/avcodec.h"
#include "libavutil/imgutils.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
long max_media_buffer;
int read_subtitles;
const char *tesseract_lang;
const char *tesseract_path;
} scan_media_ctx_t;
__always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
jpeg->width = w;
jpeg->height = h;
jpeg->time_base.den = 1000000;
jpeg->time_base.num = 1;
jpeg->i_quant_factor = qscale;
jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;
int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
if (ret != 0) {
return NULL;
}
return jpeg;
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif

View File

@@ -0,0 +1,79 @@
#include "scan_mobi.h"
#include <mobi.h>
#include <errno.h>
#include "stdlib.h"
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {
MOBIData *m = mobi_init();
if (m == NULL) {
CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
return;
}
size_t buf_len;
char* buf = read_all(f, &buf_len);
if (buf == NULL) {
mobi_free(m);
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
mobi_free(m);
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
MOBI_RET mobi_ret = mobi_load_file(m, file);
fclose(file);
if (mobi_ret != MOBI_SUCCESS) {
mobi_free(m);
free(buf);
CTX_LOG_ERRORF(f->filepath, "mobi_laod_file() returned error code [%d]", mobi_ret)
return;
}
char *author = mobi_meta_get_author(m);
if (author != NULL) {
APPEND_STR_META(doc, MetaAuthor, author)
free(author);
}
char *title = mobi_meta_get_title(m);
if (title != NULL) {
APPEND_STR_META(doc, MetaTitle, title)
free(title);
}
const size_t maxlen = mobi_get_text_maxsize(m);
if (maxlen == MOBI_NOTSET) {
mobi_free(m);
free(buf);
CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
return;
}
char *content_str = malloc(maxlen + 1);
size_t length = maxlen;
mobi_ret = mobi_get_rawml(m, content_str, &length);
if (mobi_ret != MOBI_SUCCESS) {
mobi_free(m);
free(content_str);
free(buf);
CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
return;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_markup(&tex, content_str);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
free(content_str);
free(buf);
text_buffer_destroy(&tex);
mobi_free(m);
}

View File

@@ -0,0 +1,14 @@
#ifndef SCAN_SCAN_MOBI_H
#define SCAN_SCAN_MOBI_H
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
} scan_mobi_ctx_t;
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@@ -0,0 +1,147 @@
#include "msdoc.h"
#include <errno.h>
#include <sys/mman.h>
#include "../../third-party/antiword/src/antiword.h"
#include "../ebook/ebook.h"
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {
// Open word doc
options_type *opts = direct_vGetOptions();
opts->iParagraphBreak = 74;
opts->eConversionType = conversion_text;
opts->bHideHiddenText = 1;
opts->bRemoveRemovedText = 1;
opts->bUseLandscape = 0;
opts->eEncoding = encoding_utf_8;
opts->iPageHeight = 842; // A4
opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) {
free(buf);
return;
}
rewind(file_in);
size_t out_len;
char *out_buf;
FILE *file_out = open_memstream(&out_buf, &out_len);
diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
if (diag == NULL) {
// file_in is owned by the caller (parse_msdoc), which closes it; only release what was allocated here
fclose(file_out);
free(out_buf);
free(buf);
return;
}
iInitDocument(file_in, (int) buf_len);
const char *author = szGetAuthor();
if (author != NULL) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
const char *title = szGetTitle();
if (title != NULL) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
vFreeDocument();
bWordDecryptor(file_in, (int) buf_len, diag);
vDestroyDiagram(diag);
fclose(file_out);
if (buf_len > 0) {
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_string(&tex, out_buf, out_len);
text_buffer_terminate_string(&tex);
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&tex);
}
free(buf);
free(out_buf);
}
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {
scan_ebook_ctx_t ebook_ctx = {
.content_size = ctx->content_size,
.tn_size = ctx->tn_size,
.log = ctx->log,
.logf = ctx->logf,
.store = ctx->store,
};
// Open word doc
options_type *opts = direct_vGetOptions();
opts->iParagraphBreak = 74;
opts->eConversionType = conversion_pdf;
opts->bHideHiddenText = 1;
opts->bRemoveRemovedText = 1;
opts->bUseLandscape = 0;
opts->eEncoding = encoding_latin_1;
opts->iPageHeight = 842; // A4
opts->iPageWidth = 595;
opts->eImageLevel = level_ps_3;
int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
if (doc_word_version < 0 || doc_word_version == 3) {
free(buf);
return;
}
rewind(file);
size_t out_len;
char *out_buf;
FILE *file_out = open_memstream(&out_buf, &out_len);
diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
if (diag == NULL) {
fclose(file_out);
free(out_buf);
free(buf);
return;
}
bWordDecryptor(file, (int) buf_len, diag);
vDestroyDiagram(diag);
fclose(file_out);
parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);
free(buf);
free(out_buf);
}
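// The .doc file is converted twice with antiword: to PDF for the thumbnail (via the ebook parser) and to plain text for the content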
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
char *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
FILE *file = fmemopen(buf, buf_len, "rb");
if (file == NULL) {
free(buf);
CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
return;
}
if (ctx->tn_size > 0) {
char *buf_pdf = malloc(buf_len);
memcpy(buf_pdf, buf, buf_len);
parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
}
parse_msdoc_text(ctx, doc, file, buf, buf_len);
fclose(file);
}

View File

@@ -0,0 +1,24 @@
#ifndef SCAN_SCAN_MSDOC_H
#define SCAN_SCAN_MSDOC_H
#include "../scan.h"
typedef struct {
long content_size;
int tn_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
unsigned int msdoc_mime;
} scan_msdoc_ctx_t;
__always_inline
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
return mime == ctx->msdoc_mime;
}
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

47
third-party/libscan/libscan/ocr/ocr.h vendored Normal file
View File

@@ -0,0 +1,47 @@
#ifndef OCR_H
#define OCR_H
#include "../scan.h"
#include <tesseract/capi.h>
#define MIN_OCR_WIDTH 350
#define MIN_OCR_HEIGHT 100
#define MIN_OCR_LEN 10
#define OCR_IS_VALID_BPP(d) \
((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || (d) == 16 || (d) == 24 || \
(d) == 32)
typedef void (*ocr_extract_callback_t)(const char *, size_t);
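// Run Tesseract on a raw image buffer; images that are too small or have an unsupported bytes-per-pixel value are skipped, and results shorter than MIN_OCR_LEN are discarded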
__always_inline static void
ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
const unsigned char *img_buf, const int img_w, const int img_h,
const int img_bpp, const int img_stride, const int img_xres,
const ocr_extract_callback_t cb) {
if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
!OCR_IS_VALID_BPP(img_bpp)) {
return;
}
TessBaseAPI *api = TessBaseAPICreate();
TessBaseAPIInit3(api, tesseract_path, tesseract_lang);
TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
TessBaseAPISetSourceResolution(api, img_xres);
char *text = TessBaseAPIGetUTF8Text(api);
if (text != NULL) {
size_t len = strlen(text);
if (len >= MIN_OCR_LEN) {
cb(text, len);
}
TessDeleteText(text);
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
}
#endif
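For illustration only (not part of this diff): a minimal sketch of how a caller might invoke ocr_extract_text() on a raw grayscale buffer. Only the ocr_extract_text() signature comes from the header above; the tessdata path, language, image and dimensions below are placeholder assumptions, and the program still needs the libscan headers and Tesseract to build.

#include <stdio.h>
#include <string.h>
#include "ocr.h"

// Hypothetical callback: print whatever text Tesseract recognized
static void print_ocr_text(const char *text, size_t len) {
    printf("OCR result (%zu bytes): %s\n", len, text);
}

int main(void) {
    // Hypothetical 400x200 8-bit grayscale image filled with white (no text will be found)
    static unsigned char img[400 * 200];
    memset(img, 0xFF, sizeof(img));
    // Path and language are assumptions; adjust to the local tessdata install
    ocr_extract_text("/usr/share/tessdata", "eng",
                     img, 400, 200,
                     1 /* bytes per pixel */, 400 /* stride */, 72 /* DPI */,
                     print_ocr_text);
    return 0;
}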

View File

@@ -0,0 +1,260 @@
#include "ooxml.h"
#include <archive.h>
#include <archive_entry.h>
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
#define _X(str) ((const xmlChar*)str)
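// Only OOXML parts that contain user-visible text (document body, notes, headers/footers, slides, worksheets) are parsed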
__always_inline
static int should_read_part(const char *part) {
if (part == NULL) {
return FALSE;
}
if ( // Word
STR_STARTS_WITH_CONSTANT(part, "word/document.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/endnotes.xml")
|| STR_STARTS_WITH_CONSTANT(part, "word/footer")
|| STR_STARTS_WITH_CONSTANT(part, "word/header")
// PowerPoint
|| STR_STARTS_WITH_CONSTANT(part, "ppt/slides/slide")
|| STR_STARTS_WITH_CONSTANT(part, "ppt/notesSlides/slide")
// Excel
|| STR_STARTS_WITH_CONSTANT(part, "xl/worksheets/sheet")
|| STR_STARTS_WITH_CONSTANT(part, "xl/sharedStrings.xml")
|| STR_STARTS_WITH_CONSTANT(part, "xl/workbook.xml")
) {
return TRUE;
}
return FALSE;
}
int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
//TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
xmlErrorPtr err = xmlGetLastError();
if (err != NULL) {
if (err->level == XML_ERR_FATAL) {
CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", err->message)
return -1;
}
}
for (xmlNode *child = node; child; child = child->next) {
if (child->name != NULL && *child->name == 't' && *(child->name + 1) == '\0') {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text) {
int ret = text_buffer_append_string0(buf, (char *) text);
text_buffer_append_char(buf, ' ');
xmlFree(text);
if (ret == TEXT_BUF_FULL) {
return ret;
}
}
}
if (extract_text(ctx, xml, child->children, buf) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}
int xml_io_read(void *context, char *buffer, int len) {
struct archive *a = context;
return (int) archive_read_data(a, buffer, len);
}
int xml_io_close(UNUSED(void *context)) {
//noop
return 0;
}
#define READ_PART_ERR (-2)
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return READ_PART_ERR;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return READ_PART_ERR;
}
int ret = extract_text(ctx, xml, root, buf);
xmlFreeDoc(xml);
return ret;
}
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return -1;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return -1;
}
if (xmlStrEqual(root->name, _X("Properties"))) {
for (xmlNode *child = root->children; child; child = child->next) {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text == NULL) {
continue;
}
if (xmlStrEqual(child->name, _X("Pages"))) {
APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
}
xmlFree(text);
}
}
xmlFreeDoc(xml);
return 0;
}
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {
xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL,
XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (xml == NULL) {
CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
return -1;
}
xmlNode *root = xmlDocGetRootElement(xml);
if (root == NULL) {
CTX_LOG_ERROR(doc->filepath, "Empty document")
xmlFreeDoc(xml);
return -1;
}
if (xmlStrEqual(root->name, _X("coreProperties"))) {
for (xmlNode *child = root->children; child; child = child->next) {
xmlChar *text = xmlNodeListGetString(xml, child->xmlChildrenNode, 1);
if (text == NULL) {
continue;
}
if (xmlStrEqual(child->name, _X("title"))) {
APPEND_STR_META(doc, MetaTitle, (char *) text)
} else if (xmlStrEqual(child->name, _X("creator"))) {
APPEND_STR_META(doc, MetaAuthor, (char *) text)
} else if (xmlStrEqual(child->name, _X("lastModifiedBy"))) {
APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
}
xmlFree(text);
}
}
xmlFreeDoc(xml);
return 0;
}
#define MAX_TN_SIZE (1024 * 1024 * 15)
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
size_t entry_size = archive_entry_size(entry);
if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
return;
}
char *buf = malloc(entry_size);
archive_read_data(a, buf, entry_size);
APPEND_TN_META(doc, 1, 1) // Size unknown
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, entry_size);
free(buf);
}
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
struct archive *a = archive_read_new();
archive_read_support_format_zip(a);
int ret = archive_read_open_memory(a, buf, buf_len);
if (ret != ARCHIVE_OK) {
CTX_LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
archive_read_free(a);
free(buf);
return;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
struct archive_entry *entry;
int buffer_full = FALSE;
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
const char *path = archive_entry_pathname(entry);
if (!buffer_full && should_read_part(path) && ctx->content_size > 0) {
ret = read_part(ctx, a, &tex, doc);
if (ret == READ_PART_ERR) {
break;
} else if (ret == TEXT_BUF_FULL) {
buffer_full = TRUE;
}
} else if (strcmp(path, "docProps/app.xml") == 0) {
if (read_doc_props_app(ctx, a, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/core.xml") == 0) {
if (read_doc_props(ctx, a, doc) != 0) {
break;
}
} else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
read_thumbnail(ctx, doc, a, entry);
}
}
}
if (tex.dyn_buffer.cur > 0) {
text_buffer_terminate_string(&tex);
meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
meta->key = MetaContent;
strcpy(meta->str_val, tex.dyn_buffer.buf);
APPEND_META(doc, meta)
}
archive_read_close(a);
archive_read_free(a);
text_buffer_destroy(&tex);
free(buf);
}

View File

@@ -0,0 +1,16 @@
#ifndef SCAN_OOXML_H
#define SCAN_OOXML_H
#include <stdlib.h>
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

224
third-party/libscan/libscan/raw/raw.c vendored Normal file
View File

@@ -0,0 +1,224 @@
#include "raw.h"
#include <libraw/libraw.h>
#include "../media/media.h"
#include <unistd.h>
#define MIN_SIZE 32
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
return store_image_thumbnail((scan_media_ctx_t *) ctx, img->data, img->data_size, doc, "x.jpeg");
}
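// Scale an RGB24 bitmap preview down to tn_size (preserving aspect ratio) and store it as a JPEG thumbnail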
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
int dstW;
int dstH;
if (img->width <= ctx->tn_size && img->height <= ctx->tn_size) {
dstW = img->width;
dstH = img->height;
} else {
double ratio = (double) img->width / img->height;
if (img->width > img->height) {
dstW = ctx->tn_size;
dstH = (int) (ctx->tn_size / ratio);
} else {
dstW = (int) (ctx->tn_size * ratio);
dstH = ctx->tn_size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return FALSE;
}
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
img->width, img->height, AV_PIX_FMT_RGB24,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
const uint8_t *in_data[1] = {img->data};
int in_line_size[1] = {3 * img->width};
sws_scale(sws_ctx,
in_data, in_line_size,
0, img->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
return TRUE;
}
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
libraw_data_t *libraw_lib = libraw_init(0);
if (!libraw_lib) {
CTX_LOG_ERROR("raw.c", "Cannot create libraw handle")
return;
}
size_t buf_len = 0;
void *buf = read_all(f, &buf_len);
if (buf == NULL) {
CTX_LOG_ERROR(f->filepath, "read_all() failed")
return;
}
int ret = libraw_open_buffer(libraw_lib, buf, buf_len);
if (ret != 0) {
CTX_LOG_ERROR(f->filepath, "Could not open raw file")
free(buf);
libraw_close(libraw_lib);
return;
}
if (*libraw_lib->idata.model != '\0') {
APPEND_STR_META(doc, MetaExifModel, libraw_lib->idata.model)
}
if (*libraw_lib->idata.make != '\0') {
APPEND_STR_META(doc, MetaExifMake, libraw_lib->idata.make)
}
if (*libraw_lib->idata.software != '\0') {
APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
}
APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)
char tmp[1024];
snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)
if (*libraw_lib->other.desc != '\0') {
APPEND_STR_META(doc, MetaContent, libraw_lib->other.desc)
}
if (*libraw_lib->other.artist != '\0') {
APPEND_STR_META(doc, MetaArtist, libraw_lib->other.artist)
}
struct tm *time = localtime(&libraw_lib->other.timestamp);
strftime(tmp, sizeof(tmp), "%Y:%m:%d %H:%M:%S", time);
APPEND_STR_META(doc, MetaExifDateTime, tmp)
snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.focal_len);
APPEND_STR_META(doc, MetaExifFocalLength, tmp)
snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
APPEND_STR_META(doc, MetaExifFNumber, tmp)
int denominator = (int) roundf(1 / libraw_lib->other.shutter);
snprintf(tmp, sizeof(tmp), "1/%d", denominator);
APPEND_STR_META(doc, MetaExifExposureTime, tmp)
libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
double gps_longitude_dec =
(gps.longitude[0] + gps.longitude[1] / 60 + gps.longitude[2] / 3600) * DMS_REF(gps.longref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
if (gps_longitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
}
double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
if (gps_latitude_dec != 0.0) {
APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
}
APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")
if (ctx->tn_size <= 0) {
free(buf);
libraw_close(libraw_lib);
return;
}
int unpack_ret = libraw_unpack_thumb(libraw_lib);
if (unpack_ret != 0) {
CTX_LOG_ERRORF(f->filepath, "libraw_unpack_thumb returned error code %d", unpack_ret)
free(buf);
libraw_close(libraw_lib);
return;
}
int errc = 0;
libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
if (errc != 0) {
free(buf);
libraw_dcraw_clear_mem(thumb);
libraw_close(libraw_lib);
return;
}
int tn_ok = 0;
if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
} else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
// TODO: technically this should work but is currently untested
tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
}
libraw_dcraw_clear_mem(thumb);
if (tn_ok == TRUE) {
free(buf);
libraw_close(libraw_lib);
return;
}
ret = libraw_unpack(libraw_lib);
if (ret != 0) {
CTX_LOG_ERROR(f->filepath, "Could not unpack raw file")
free(buf);
libraw_close(libraw_lib);
return;
}
libraw_dcraw_process(libraw_lib);
errc = 0;
libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
if (errc != 0) {
free(buf);
libraw_dcraw_clear_mem(img);
libraw_close(libraw_lib);
return;
}
store_thumbnail_rgb24(ctx, img, doc);
libraw_dcraw_clear_mem(img);
libraw_close(libraw_lib);
free(buf);
}

17
third-party/libscan/libscan/raw/raw.h vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef SIST2_RAW_H
#define SIST2_RAW_H
#include "../scan.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
} scan_raw_ctx_t;
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif //SIST2_RAW_H

171
third-party/libscan/libscan/scan.h vendored Normal file
View File

@@ -0,0 +1,171 @@
#ifndef SCAN_SCAN_H
#define SCAN_SCAN_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h"
#define SIST_SWS_ALGO SWS_LANCZOS
#define UNUSED(x) __attribute__((__unused__)) x
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
typedef int scan_code_t;
#define SCAN_OK (scan_code_t) 0
#define SCAN_ERR_READ (scan_code_t) (-1)
#define SCAN_ERR_SKIP (scan_code_t) (-2)
#define LEVEL_DEBUG 0
#define LEVEL_INFO 1
#define LEVEL_WARNING 2
#define LEVEL_ERROR 3
#define LEVEL_FATAL 4
#define CTX_LOG_DEBUGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_DEBUG, fmt, __VA_ARGS__);
#define CTX_LOG_DEBUG(filepath, str) ctx->log(filepath, LEVEL_DEBUG, str);
#define CTX_LOG_INFOF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_INFO, fmt, __VA_ARGS__);
#define CTX_LOG_INFO(filepath, str) ctx->log(filepath, LEVEL_INFO, str);
#define CTX_LOG_WARNINGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_WARNING, fmt, __VA_ARGS__);
#define CTX_LOG_WARNING(filepath, str) ctx->log(filepath, LEVEL_WARNING, str);
#define CTX_LOG_ERRORF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_ERROR, fmt, __VA_ARGS__);
#define CTX_LOG_ERROR(filepath, str) ctx->log(filepath, LEVEL_ERROR, str);
#define CTX_LOG_FATALF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1);
#define CTX_LOG_FATAL(filepath, str) ctx->log(filepath, LEVEL_FATAL, str); exit(-1);
enum metakey {
// String
MetaContent = 1,
MetaMediaAudioCodec,
MetaMediaVideoCodec,
MetaArtist,
MetaAlbum,
MetaAlbumArtist,
MetaGenre,
MetaTitle,
MetaFontName,
MetaParent,
MetaExifMake,
MetaExifDescription,
MetaExifSoftware,
MetaExifExposureTime,
MetaExifFNumber,
MetaExifFocalLength,
MetaExifUserComment,
MetaExifModel,
MetaExifIsoSpeedRatings,
MetaExifDateTime,
MetaAuthor,
MetaModifiedBy,
MetaThumbnail,
MetaChecksum,
// Number
MetaWidth,
MetaHeight,
MetaMediaDuration,
MetaMediaBitrate,
MetaPages,
// ??
MetaExifGpsLongitudeDMS,
MetaExifGpsLongitudeRef,
MetaExifGpsLatitudeDMS,
MetaExifGpsLatitudeRef,
MetaExifGpsLatitudeDec,
MetaExifGpsLongitudeDec,
};
typedef struct meta_line {
struct meta_line *next;
enum metakey key;
union {
char str_val[0];
unsigned long long_val;
double double_val;
};
} meta_line_t;
typedef struct document {
unsigned char path_md5[MD5_DIGEST_LENGTH];
unsigned long size;
unsigned int mime;
int mtime;
short base;
short ext;
char has_parent;
meta_line_t *meta_head;
meta_line_t *meta_tail;
char *filepath;
} document_t;
typedef struct vfile vfile_t;
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
__attribute__((warn_unused_result))
typedef long (*seek_func_t)(struct vfile *, long offset, int whence);
typedef void (*close_func_t)(struct vfile *);
typedef void (*reset_func_t)(struct vfile *);
typedef struct vfile {
union {
int fd;
struct archive *arc;
const void *_test_data;
};
int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath;
struct stat info;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
void *rewind_buffer;
int rewind_buffer_size;
int rewind_buffer_cursor;
read_func_t read;
read_func_t read_rewindable;
close_func_t close;
reset_func_t reset;
log_callback_t log;
logf_callback_t logf;
} vfile_t;
typedef struct parse_job_t {
int base;
int ext;
struct vfile vfile;
unsigned char parent[MD5_DIGEST_LENGTH];
char filepath[1];
} parse_job_t;
#include "util.h"
typedef void (*parse_callback_t)(parse_job_t *job);
#endif

64
third-party/libscan/libscan/text/text.c vendored Normal file
View File

@@ -0,0 +1,64 @@
#include "text.h"
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(ctx->content_size, f->info.st_size);
if (to_read <= 2) {
return SCAN_OK;
}
char *buf = malloc(to_read);
int ret = f->read(f, buf, to_read);
if (ret < 0) {
CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
free(buf);
return SCAN_ERR_READ;
}
text_buffer_t tex = text_buffer_create(ctx->content_size);
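// Check the first two bytes for a UTF-16 byte-order mark; without one, the content is treated as UTF-8/ASCII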
if ((*(int16_t*)buf) == (int16_t)0xFFFE) {
text_buffer_append_string16_le(&tex, buf + 2, to_read - 2);
} else if((*(int16_t*)buf) == (int16_t)0xFEFF) {
text_buffer_append_string16_be(&tex, buf + 2, to_read - 2);
} else {
text_buffer_append_string(&tex, buf, to_read);
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}
#define MAX_MARKUP_SIZE (1024 * 1024)
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {
int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);
char *buf = malloc(to_read + 1);
int ret = f->read(f, buf, to_read);
if (ret < 0) {
CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
free(buf);
return SCAN_ERR_READ;
}
*(buf + to_read) = '\0';
text_buffer_t tex = text_buffer_create(ctx->content_size);
text_buffer_append_markup(&tex, buf);
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);
free(buf);
text_buffer_destroy(&tex);
return SCAN_OK;
}

18
third-party/libscan/libscan/text/text.h vendored Normal file
View File

@@ -0,0 +1,18 @@
#ifndef SCAN_TEXT_H
#define SCAN_TEXT_H
#include "../scan.h"
#include "../util.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
} scan_text_ctx_t;
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

0
third-party/libscan/libscan/util.c vendored Normal file
View File

361
third-party/libscan/libscan/util.h vendored Normal file
View File

@@ -0,0 +1,361 @@
#ifndef SCAN_UTIL_H
#define SCAN_UTIL_H
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
#define STR_STARTS_WITH_CONSTANT(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
#define TEXT_BUF_FULL (-1)
#define INITIAL_BUF_SIZE (1024 * 16)
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) (\
((c) >= '\'' && (c) <= ';') || \
((c) >= 'A' && (c) <= 'z') || \
((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD))
typedef struct dyn_buffer {
char *buf;
size_t cur;
size_t size;
} dyn_buffer_t;
typedef struct text_buffer {
long max_size;
int last_char_was_whitespace;
dyn_buffer_t dyn_buffer;
} text_buffer_t;
static int utf8_validchr2(const char *s) {
if (0x00 == (0x80 & *s)) {
return TRUE;
} else if (0xf0 == (0xf8 & *s)) {
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
(0x80 != (0xc0 & s[3]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[4])) {
return FALSE;
}
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
return FALSE;
}
} else if (0xe0 == (0xf0 & *s)) {
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[3])) {
return FALSE;
}
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
return FALSE;
}
} else if (0xc0 == (0xe0 & *s)) {
if (0x80 != (0xc0 & s[1])) {
return FALSE;
}
if (0x80 == (0xc0 & s[2])) {
return FALSE;
}
if (0 == (0x1e & s[0])) {
return FALSE;
}
} else {
return FALSE;
}
return TRUE;
}
static dyn_buffer_t dyn_buffer_create() {
dyn_buffer_t buf;
buf.size = INITIAL_BUF_SIZE;
buf.cur = 0;
buf.buf = (char *) malloc(INITIAL_BUF_SIZE);
return buf;
}
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
if (buf->cur + size > buf->size) {
do {
buf->size *= 2;
} while (buf->cur + size > buf->size);
buf->buf = (char *) realloc(buf->buf, buf->size);
}
}
static void grow_buffer_small(dyn_buffer_t *buf) {
if (buf->cur + sizeof(long) > buf->size) {
buf->size *= 2;
buf->buf = (char *) realloc(buf->buf, buf->size);
}
}
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
grow_buffer(buf, size);
memcpy(buf->buf + buf->cur, data, size);
buf->cur += size;
}
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
grow_buffer_small(buf);
*(buf->buf + buf->cur) = c;
buf->cur += sizeof(c);
}
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
dyn_buffer_write_char(buf, '\0');
}
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
dyn_buffer_write(buf, str, strlen(str));
}
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
grow_buffer_small(buf);
*(int *) (buf->buf + buf->cur) = d;
buf->cur += sizeof(int);
}
static void dyn_buffer_write_short(dyn_buffer_t *buf, uint16_t s) {
grow_buffer_small(buf);
*(uint16_t *) (buf->buf + buf->cur) = s;
buf->cur += sizeof(uint16_t);
}
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
grow_buffer_small(buf);
*(unsigned long *) (buf->buf + buf->cur) = l;
buf->cur += sizeof(unsigned long);
}
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
free(buf->buf);
}
static void text_buffer_destroy(text_buffer_t *buf) {
dyn_buffer_destroy(&buf->dyn_buffer);
}
static text_buffer_t text_buffer_create(long max_size) {
text_buffer_t text_buf;
text_buf.dyn_buffer = dyn_buffer_create();
text_buf.max_size = max_size;
text_buf.last_char_was_whitespace = FALSE;
return text_buf;
}
static int text_buffer_append_char(text_buffer_t *buf, int c) {
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
buf->last_char_was_whitespace = TRUE;
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
} else {
buf->last_char_was_whitespace = FALSE;
grow_buffer_small(&buf->dyn_buffer);
if (((utf8_int32_t) 0xffffff80 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c;
} else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
}
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
return 0;
}
static void text_buffer_terminate_string(text_buffer_t *buf) {
if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
} else {
dyn_buffer_write_char(&buf->dyn_buffer, '\0');
}
}
// Naive UTF-16 -> ASCII conversion: keeps only one byte of each 2-byte code unit
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
int ret = 0;
for (int i = 1; i < len; i += 2) {
ret = text_buffer_append_char(buf, str[i]);
}
return ret;
}
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
int ret = 0;
for (int i = 0; i < len; i += 2) {
ret = text_buffer_append_char(buf, str[i]);
}
return ret;
}
#define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
(0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
(0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
const char *ptr = str;
const char *oldPtr = ptr;
if (str == NULL || UTF8_END_OF_STRING) {
return 0;
}
if (len <= 4) {
for (int i = 0; i < len; i++) {
if (((utf8_int32_t) 0xffffff80 & str[i]) == 0 && SHOULD_KEEP_CHAR(str[i])) {
dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
}
}
return 0;
}
utf8_int32_t c;
char tmp[16] = {0};
do {
ptr = (char *) utf8codepoint(ptr, &c);
*(int *) tmp = 0x00000000;
memcpy(tmp, oldPtr, ptr - oldPtr);
oldPtr = ptr;
if (!utf8_validchr2(tmp)) {
continue;
}
int ret = text_buffer_append_char(buf, c);
if (ret != 0) {
return ret;
}
} while (!UTF8_END_OF_STRING);
return 0;
}
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
return text_buffer_append_string(buf, str, strlen(str));
}
static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {
int tag_open = TRUE;
const char *ptr = markup;
const char *start = markup;
while (*ptr != '\0') {
if (tag_open) {
if (*ptr == '>') {
tag_open = FALSE;
start = ptr + 1;
}
} else {
if (*ptr == '<') {
tag_open = TRUE;
if (ptr != start) {
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
}
}
ptr += 1;
}
if (ptr != start) {
if (text_buffer_append_string(buf, start, (ptr - start)) == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
if (text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
return TEXT_BUF_FULL;
}
}
return 0;
}
static void *read_all(vfile_t *f, size_t *size) {
void *buf = malloc(f->info.st_size);
*size = f->read(f, buf, f->info.st_size);
if (*size != f->info.st_size) {
free(buf);
return NULL;
}
return buf;
}
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
unsigned char stack_buf[STACK_BUFFER_SIZE];
void *sha1_buf;
if (size <= STACK_BUFFER_SIZE) {
sha1_buf = stack_buf;
} else {
void *heap_sha1_buf = malloc(size);
sha1_buf = heap_sha1_buf;
}
memcpy(sha1_buf, buf, size);
SHA1_Update(ctx, (const void *) sha1_buf, size);
if (sha1_buf != stack_buf) {
free(sha1_buf);
}
}
#endif
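As a quick illustration (not part of this diff) of the text_buffer API above: create, append, terminate, then read the accumulated UTF-8 out of dyn_buffer.buf. The function name is hypothetical and the include path is assumed:
#include <stdio.h>
#include "util.h"

static void text_buffer_example(void) {
    text_buffer_t tex = text_buffer_create(4096);           // cap output at ~4 KiB
    text_buffer_append_string0(&tex, "Hello,   world");     // runs of whitespace collapse to one space
    text_buffer_append_markup(&tex, "<p>tags are stripped</p>");
    text_buffer_terminate_string(&tex);
    printf("%s\n", tex.dyn_buffer.buf);
    text_buffer_destroy(&tex);
}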

200
third-party/libscan/libscan/wpd/libwpd_c_api.cpp vendored Normal file

@@ -0,0 +1,200 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
class StringDocument : public WPXDocumentInterface {
private:
text_buffer_t *tex;
document_t *doc;
bool is_full;
public:
StringDocument(text_buffer_t *tex, document_t *doc) {
this->tex = tex;
this->doc = doc;
this->is_full = false;
}
void setDocumentMetaData(const WPXPropertyList &propList) override {
WPXPropertyList::Iter propIter(propList);
for (propIter.rewind(); propIter.next();) {
// TODO: Read metadata here ?!
}
}
void endDocument() override {
text_buffer_terminate_string(this->tex);
}
void closeParagraph() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSpan() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void closeSection() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertTab() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertSpace() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertText(const WPXString &text) override {
if (!this->is_full) {
if (text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void insertLineBreak() override {
if (!this->is_full) {
if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
this->is_full = true;
};
}
}
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
void closePageSpan() override { /* noop */ }
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
void closeHeader() override { /* noop */ }
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
void closeFooter() override { /* noop */ }
void
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
void
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void closeOrderedListLevel() override { /* noop */ }
void closeUnorderedListLevel() override { /* noop */ }
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void closeListElement() override { /* noop */ }
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
void closeFootnote() override { /* noop */ }
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
void closeEndnote() override { /* noop */ }
void openComment(const WPXPropertyList &propList) override { /* noop */ }
void closeComment() override { /* noop */ }
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
void closeTextBox() override { /* noop */ }
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
void closeTableRow() override { /* noop */ }
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTableCell() override { /* noop */ }
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTable() override { /* noop */ }
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
void closeFrame() override { /* noop */ }
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
void startDocument() override { /* noop */ };
};
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
auto *input = new WPXStringStream(buf, buf_len);
return input;
}
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
auto *stream = (WPXStringStream *) ptr;
WPDConfidence confidence = WPDocument::isFileFormatSupported(stream);
return (wpd_confidence_t) confidence;
}
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
auto *stream = (WPXStringStream *) ptr;
auto myDoc = StringDocument(tex, doc);
WPDResult result2 = WPDocument::parse(stream, &myDoc, nullptr);
return (wpd_result_t) result2;
}
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
auto *stream = (WPXStringStream *) ptr;
delete stream;
}

50
third-party/libscan/libscan/wpd/libwpd_c_api.h vendored Normal file

@@ -0,0 +1,50 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
typedef void *wpd_stream_t;
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
typedef enum {
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

41
third-party/libscan/libscan/wpd/wpd.c vendored Normal file

@@ -0,0 +1,41 @@
#include "wpd.h"
#include "libwpd_c_api.h"
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {
size_t buf_len;
void *buf = read_all(f, &buf_len);
void *stream = wpd_memory_stream_create(buf, buf_len);
wpd_confidence_t conf = wpd_is_file_format_supported(stream);
if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_ERR_READ;
}
if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_ERR_READ;
}
text_buffer_t tex = text_buffer_create(-1);
wpd_result_t res = wpd_parse(stream, &tex, doc);
if (res != C_WPD_OK) {
CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
doc->filepath, res)
}
if (tex.dyn_buffer.cur != 0) {
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
}
text_buffer_destroy(&tex);
wpd_memory_stream_destroy(stream);
free(buf);
return SCAN_OK;
}

23
third-party/libscan/libscan/wpd/wpd.h vendored Normal file

@@ -0,0 +1,23 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
unsigned int wpd_mime;
} scan_wpd_ctx_t;
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
return mime == ctx->wpd_mime;
}
#endif
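A hypothetical dispatch sketch (not part of this diff): the caller is assumed to have already resolved the file's mime id and opened the vfile_t, and the function name is illustrative only:
#include "wpd.h"

static void maybe_parse_wpd_example(scan_wpd_ctx_t *wpd_ctx, unsigned int mime,
                                    vfile_t *f, document_t *doc) {
    if (is_wpd(wpd_ctx, mime)) {
        // On success, the extracted text is attached to doc as a MetaContent entry
        parse_wpd(wpd_ctx, f, doc);
    }
}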

1182
third-party/libscan/test/main.cpp vendored Normal file

File diff suppressed because it is too large.

Some files were not shown because too many files have changed in this diff.