Compare commits

...

68 Commits

Author SHA1 Message Date
4dedd281f1 Push compiled vue changes 2022-01-09 09:30:31 -05:00
65c499e477 Merge pull request #231 from simon987/dev
v2.11.6
2022-01-09 09:28:24 -05:00
625f3d0d6e Option to update media type tab in real time, add media type table in details 2022-01-08 18:23:22 -05:00
64b8aab8bf Validate that all the tesseract data files are in the same folder 2022-01-08 15:04:07 -05:00
ad95684771 Update --ocr-* args, enable OCR'ing images 2022-01-08 14:24:50 -05:00
b37e5a4ad4 Fix some warnings in media.c 2022-01-08 11:06:14 -05:00
15ae2190cf Fix tesseract lang validation, update README.md, fix tesseract memory leak 2022-01-08 11:04:52 -05:00
255bc2d689 Tweak MIN_OCR_SIZE behavior, update gitignore 2022-01-08 10:33:02 -05:00
fe1aa6dd4c Merge pull request #227 from yatli/dev
refactor: split ocr_extract_text from ebook
2022-01-08 10:25:41 -05:00
cd2a44e016 Update ocr.h
Fix minimum image size validation in ocr_extract_text
2022-01-08 10:24:57 -05:00
ed2a3f342a Localize tag add/delete, fix some translations, add LanguageIcon, add --lang arg, fix lightbox slideshow time, fix gif hover 2022-01-08 10:03:38 -05:00
1107fe9a53 Remove libscan hash debug info 2022-01-08 10:00:34 -05:00
a96e65d039 Add zh-CN option in language dropdown 2022-01-07 17:44:49 -05:00
87936eecd4 Merge pull request #229 from yatli/master
add zh-CN translation
2022-01-07 13:55:14 -05:00
Yatao Li
d817a0e9dd add zh-CN translation 2022-01-08 01:39:50 +08:00
Yatao Li
94a5e0ac59 refactor: split ocr_extract_text from ebook 2022-01-07 23:20:35 +08:00
d40f5052f9 static link for libasan in debug build 2021-12-29 19:25:03 -05:00
ee9a8fa514 Add thread lock for incremental_mark_file_for_copy() 2021-12-29 19:18:10 -05:00
81008d8936 Add --list-file argument 2021-12-29 18:54:13 -05:00
52466d5d8a Update tesseract datapaths 2021-12-25 11:12:00 -05:00
5f73fc024b Version bump, update readme 2021-12-25 11:08:52 -05:00
f2fd7ccf41 Fix raw parsing maybe, fix index picker css 2021-12-25 11:08:52 -05:00
d87fee8e00 Merge pull request #214 from dpieski/patch-2
Update USAGE.md
2021-12-22 09:55:24 -05:00
Andrew
672d1344d7 Update USAGE.md
Get-WmiObject is deprecated in favor of Get-CimInstance
2021-12-15 15:00:36 -06:00
27e32db1ed Fix attempt for excludes 2021-11-17 20:18:48 -05:00
bb91139ffb console log fixes, version bump 2021-11-15 20:52:24 -05:00
70cfa8c37c Fix Dockerfile.arm64 2021-11-13 18:25:24 -05:00
7493dedc8c Merge pull request #208 from simon987/dev
v2.11.4
2021-11-13 17:37:47 -05:00
c786a31bb2 Merge remote-tracking branch 'origin/master' into dev
# Conflicts:
#	README.md
2021-11-13 17:36:55 -05:00
48d024e751 Update dockerfiles 2021-11-13 17:36:30 -05:00
08b2ca9d43 Update lcms -> lcms2 2021-11-12 11:29:50 -05:00
ed8b4f4fad Add natural sorting support 2021-11-12 10:33:51 -05:00
66de93a8bd Language & formatting 2021-11-12 10:17:32 -05:00
e3f78fb693 Shift click & select all/none in index picker 2021-11-12 10:12:25 -05:00
030643cee0 Move CI scripts to script folder 2021-11-12 09:05:37 -05:00
b17b9439df Print progress bar in index module 2021-11-07 13:20:05 -05:00
414f65346c Update docker command in README.md 2021-11-07 13:18:32 -05:00
be8eedc9c7 Skip subtree of excluded directories 2021-11-07 11:56:09 -05:00
5b62fe77f2 Update demo URL 2021-11-07 09:52:28 -05:00
61ab68ce15 Update argparse repo URL 2021-11-07 09:42:17 -05:00
82ecb8bb85 Update gitignore 2021-11-07 09:36:39 -05:00
a41b5dcc1f Remove libscan git submodule 2021-11-07 09:30:14 -05:00
06f21d5f0f Remove libscan submodule 2021-11-07 09:17:02 -05:00
e82a388d1e Don't show resolution badge on narrow images 2021-10-22 10:21:35 -04:00
bf02e571b3 Forgot to add that file two commits ago 2021-10-22 09:44:56 -04:00
750a392a61 Show reduced ResuldCard when there are no results 2021-10-22 09:32:17 -04:00
3d7b977a82 Read ES version, handle legacy versions, add notice & debug info 2021-10-21 19:14:43 -04:00
cd71551a22 Some documentation updates 2021-09-25 09:30:53 -04:00
58741058cf Merge pull request #200 from simon987/dev
v2.11.3
2021-09-24 20:56:00 -04:00
0a7e59b646 Some documentation updates 2021-09-24 20:55:08 -04:00
43a566fe2f Version bump 2021-09-24 20:33:19 -04:00
b2631a86c8 Rework index picker 2021-09-24 20:31:11 -04:00
d0a1deca30 Fix thumbnail in DocInfoModal.vue 2021-09-24 19:40:06 -04:00
b03ce90a05 Fix max_analyzed_offset error 2021-09-20 21:01:23 -04:00
a5eacb4950 Set list item color for sub-documents 2021-09-20 20:40:48 -04:00
0887046b41 Fix sidecar files, better error handling in store_write 2021-09-20 20:34:05 -04:00
17fda1e540 Support for rewind buffer 2021-09-11 20:46:40 -04:00
34b363bfd8 Add argument to calculate checksums 2021-09-11 14:31:48 -04:00
c9aa4bed72 Add argument to calculate checksums 2021-09-11 14:31:31 -04:00
7267d4bd2c Add basic JSON/NDJSON support 2021-09-07 08:14:32 -04:00
43470e9ce6 Add basic JSON/NDJSON support 2021-09-06 21:27:17 -04:00
0331d46fff Merge pull request #186 from simon987/dev
v2.11.2
2021-09-06 14:14:51 -04:00
bbf1aca936 Version bump 2021-09-06 14:14:00 -04:00
27560a82bb Basic support for WordPerfect files 2021-09-06 14:08:53 -04:00
f16ead1902 Parse page numbers from .docx files 2021-09-06 09:50:00 -04:00
e2e07e80c7 Install libasan5 in Dockerfile 2021-09-06 09:25:01 -04:00
9499c6b189 Add v prefix in version badge 2021-09-06 09:18:28 -04:00
c5cd00b76c Update USAGE.md 2021-09-05 20:26:09 -04:00
115 changed files with 7450 additions and 848 deletions

View File

@@ -10,22 +10,7 @@ steps:
- name: build
image: simon987/sist2-build
commands:
- ./ci/build.sh
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
- ./scripts/build.sh
- name: scp files
image: appleboy/drone-scp
settings:
@@ -42,6 +27,21 @@ steps:
- ./VERSION
- ./sist2-x64-linux
- ./sist2-x64-linux-debug
- name: docker
image: plugins/docker
settings:
username:
from_secret: DOCKER_USER
password:
from_secret: DOCKER_PASSWORD
repo: simon987/sist2
context: ./
dockerfile: ./Dockerfile
auto_tag: true
auto_tag_suffix: x64-linux
when:
event:
- tag
---
kind: pipeline
@@ -55,7 +55,7 @@ steps:
- name: build
image: simon987/sist2-build-arm64
commands:
- ./ci/build_arm64.sh
- ./scripts/build_arm64.sh
- name: scp files
image: appleboy/drone-scp
settings:

8
.gitignore vendored
View File

@@ -10,17 +10,19 @@ Makefile
LOG
sist2*
!sist2-vue/
index.sist2/
*.sist2/
bundle*.css
bundle.js
*.a
vgcore.*
build/
third-party/
third-party/argparse
*.idx/
VERSION
git_hash.h
Testing/
test_i
test_i_inc
node_modules/
node_modules/
.cmake/
i_inc/

11
.gitmodules vendored
View File

@@ -1,6 +1,9 @@
[submodule "third-party/libscan"]
path = third-party/libscan
url = https://github.com/simon987/libscan
[submodule "third-party/argparse"]
path = third-party/argparse
url = https://github.com/cofyc/argparse
url = https://github.com/simon987/argparse
[submodule "third-party/libscan/third-party/utf8.h"]
path = third-party/libscan/third-party/utf8.h
url = https://github.com/sheredom/utf8.h
[submodule "third-party/libscan/third-party/antiword"]
path = third-party/libscan/third-party/antiword
url = https://github.com/simon987/antiword

View File

@@ -22,9 +22,6 @@ add_subdirectory(third-party/argparse)
add_executable(sist2
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
src/main.c
src/sist.h
src/io/walk.h src/io/walk.c
@@ -41,7 +38,11 @@ add_executable(sist2
src/log.c src/log.h
src/cli.c src/cli.h
src/stats.c src/stats.h src/ctx.c
src/parsing/sidecar.c src/parsing/sidecar.h)
src/parsing/sidecar.c src/parsing/sidecar.h
# argparse
third-party/argparse/argparse.h third-party/argparse/argparse.c
)
target_link_directories(sist2 PRIVATE BEFORE ${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib/)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib)
@@ -86,6 +87,7 @@ if (SIST_DEBUG)
sist2
PRIVATE
-fsanitize=address
-static-libasan
)
set_target_properties(
sist2

View File

@@ -6,12 +6,10 @@ COPY . .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2
RUN ls -lh
RUN ls -lh sist2-vue/dist/
FROM ubuntu:20.10
FROM ubuntu:21.10
RUN apt update && apt install -y curl
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -22,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -7,9 +7,9 @@ RUN cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE
RUN make -j$(nproc)
RUN strip sist2
FROM ubuntu:20.10
FROM --platform="linux/arm64/v8" ubuntu:21.10
RUN apt update && apt install -y curl
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -20,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2

View File

@@ -2,7 +2,7 @@
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/?i=Demo%20files)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
# sist2
@@ -10,7 +10,7 @@ sist2 (Simple incremental search tool)
*Warning: sist2 is in early development*
![sist2.png](docs/sist2.png)
![search panel](docs/sist2.png)
## Features
@@ -33,12 +33,11 @@ sist2 (Simple incremental search tool)
## Getting Started
1. Have an Elasticsearch (>= 6.X.X) instance running
1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
```bash
docker run -d --name es1 --net sist2_net -p 9200:9200 \
-e "discovery.type=single-node" elasticsearch:7.14.0
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.0
```
1. *(or)* Run using docker-compose:
```yaml
@@ -50,8 +49,9 @@ sist2 (Simple incremental search tool)
```
1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.0-x64-linux`
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.6-x64-linux`
1. See [Usage guide](docs/USAGE.md)
@@ -67,21 +67,23 @@ See [Usage guide](docs/USAGE.md) for more details
## Format support
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
cbz,cbr | *(none)* | - | yes | - |
`audio/*` | ffmpeg | - | yes | ID3 tags |
`video/*` | ffmpeg | - | yes | title, comment, artist |
`image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - |
html, xml | *(none)* | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
docx, xlsx, pptx | *(none)* | yes | if embedded | creator, modified_by, title |
doc (MS Word 97-2003) | antiword | yes | yes | author, title |
mobi, azw, azw3 | libmobi | yes | no | author, title |
| File type | Library | Content | Thumbnail | Metadata |
|:--------------------------------------------------------------------------|:-----------------------------------------------------------------------------|:---------|:------------|:---------------------------------------------------------------------------------------------------------------------------------------|
| pdf,xps,fb2,epub | MuPDF | text+ocr | yes | author, title |
| cbz,cbr | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | - | yes | - |
| `audio/*` | ffmpeg | - | yes | ID3 tags |
| `video/*` | ffmpeg | - | yes | title, comment, artist |
| `image/*` | ffmpeg | - | yes | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190), GPS tags |
| raw, rw2, dng, cr2, crw, dcr, k25, kdc, mrw, pef, xf3, arw, sr2, srf, erf | LibRaw | - | yes | Common EXIF tags, GPS tags |
| ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
| `text/plain` | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| html, xml | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | no | - |
| tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
| docx, xlsx, pptx | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | if embedded | creator, modified_by, title |
| doc (MS Word 97-2003) | antiword | yes | yes | author, title |
| mobi, azw, azw3 | libmobi | yes | no | author, title |
| wpd (WordPerfect) | libwpd | yes | no | *planned* |
| json, jsonl, ndjson | [libscan](https://github.com/simon987/sist2/tree/master/third-party/libscan) | yes | - | - |
\* *See [Archive files](#archive-files)*
@@ -100,18 +102,24 @@ scan is also supported.
### OCR
You can enable OCR support for pdf,xps,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
You can enable OCR support for ebook (pdf,xps,fb2,epub) or image file types with the
`--ocr-lang <lang>` option in combination with `--ocr-images` and/or `--ocr-ebooks`.
Download the language data files with your package manager (`apt install tesseract-ocr-eng`) or
directly [from Github](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
The `simon987/sist2` image comes with common languages
(hin, jpn, eng, fra, rus, spa) pre-installed.
Examples
You can use the `+` separator to specify multiple languages. The language
name must be identical to the `*.traineddata` file installed on your system
(use `chi_sim` rather than `chi-sim`).
Examples:
```bash
sist2 scan --ocr jpn ~/Books/Manga/
sist2 scan --ocr eng ~/Books/Textbooks/
sist2 scan --ocr-ebooks --ocr-lang jpn ~/Books/Manga/
sist2 scan --ocr-images --ocr-lang eng ~/Images/Screenshots/
sist2 scan --ocr-ebooks --ocr-images --ocr-lang eng+chi_sim ~/Chinese-Bilingual/
```
## Build from source
@@ -124,7 +132,7 @@ You can compile **sist2** by yourself if you don't want to use the pre-compiled
git clone --recursive https://github.com/simon987/sist2/
cd sist2
docker build . -f ./Dockerfile -t my-sist2-image
docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
docker run --rm --entrypoint cat my-sist2-image /root/sist2 > sist2-x64-linux
```
### On a linux computer
@@ -134,14 +142,14 @@ docker run --rm my-sist2-image cat /root/sist2 > sist2-x64-linux
```bash
apt install gcc g++ python3 yasm ragel automake autotools-dev wget libtool libssl-dev curl zip unzip tar xorg-dev libglu1-mesa-dev libxcursor-dev libxml2-dev libxinerama-dev gettext nasm git
```
1. Apply vcpkg patches, as per [sist2-build](https://github.com/simon987/sist2-build) Dockerfile
1. Install vcpkg dependencies
```bash
vcpkg install curl[core,openssl]
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libuuid libmagic libraw jasper lcms gumbo
vcpkg install lmdb cjson glib brotli libarchive[core,bzip2,libxml2,lz4,lzma,lzo] pthread tesseract libxml2 libmupdf gtest mongoose libmagic libraw jasper lcms gumbo
```
1. Build

View File

@@ -14,6 +14,7 @@
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
@@ -32,7 +33,7 @@ Lightning-fast file system indexer and search tool.
Scan options
-t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=5
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=3
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files.
@@ -41,12 +42,15 @@ Scan options
--name=<str> Index display name. DEFAULT: (name of the directory)
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
--archive-passphrase=<str> Passphrase for encrypted archive files
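--ocr-lang=<str> Tesseract language (use 'tesseract --list-langs' to see which are installed on your machine)
--ocr-images Enable OCR'ing of image files.
--ocr-ebooks Enable OCR'ing of ebook files.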
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
--treemap-threshold=<str> Relative size threshold for treemap (see USAGE.md). DEFAULT: 0.0005
--mem-buffer=<int> Maximum memory buffer size per thread in MB for files inside archives (see USAGE.md). DEFAULT: 2000
--read-subtitles Read subtitles from media files
--read-subtitles Read subtitles from media files.
--fast-epub Faster but less accurate EPUB parsing (no thumbnails, metadata)
--checksums Calculate file checksums when scanning.
Index options
-t, --threads=<int> Number of threads. DEFAULT=1
@@ -66,13 +70,14 @@ Web options
--bind=<str> Listen on this address. DEFAULT=localhost:4090
--auth=<str> Basic auth in user:password format
--tag-auth=<str> Basic auth in user:password format for tagging
--tagline=<str> Tagline in navbar
--dev Serve html & js files from disk (for development)
Exec-script options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--es-index=<str> Elasticsearch index name. DEFAULT=sist2
--script-file=<str> Path to user script.
--async-script Execute user script asynchronously.
Made by simon987 <me@simon987.net>. Released under GPL-3.0
```
## Scan
@@ -80,9 +85,9 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
### Scan options
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-WmiObject Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
Number of threads for file parsing. **Do not set a number higher than `$(nproc)` or `$(Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors` in Windows!**
* `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. *Does not affect PDF thumbnails quality*
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best.
* `--size`
Thumbnail size in pixels.
* `--content-size`
@@ -125,6 +130,10 @@ Made by simon987 <me@simon987.net>. Released under GPL-3.0
To check if a media file can be parsed without *seek*, execute `cat file.mp4 | ffprobe -`
* `--read-subtitles` When enabled, will attempt to read the subtitles stream from media files.
* `--fast-epub` Much faster but less accurate EPUB parsing. When enabled, sist2 will use a simple HTML parser to read epub files instead of the MuPDF library. No thumbnails are generated and author/title metadata are not parsed.
* `--checksums` Calculate file checksums (sha1) when scanning files. This option does not cause any additional read
operations. Checksums are not calculated for all file types, unless the file is inside an archive. When enabled, duplicate
files are hidden in the web UI (this behaviour can be toggled in the Configuration page).
### Scan examples
@@ -145,15 +154,11 @@ sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
### Index format
A typical `binary` type index structure looks like this:
A typical `ndjson` type index structure looks like this:
```
documents.idx/
├── descriptor.json
├── _index_139965416830720
├── _index_139965425223424
├── _index_139965433616128
├── _index_139965442008832
├── _index_139965442008832
├── _index_main.ndjson.zst
├── treemap.csv
├── agg_mime.csv
├── agg_date.csv
@@ -169,9 +174,7 @@ documents.idx/
└── lock.mdb
```
The `_index_*` files contain the raw binary index data and are not meant to be
read by other applications. The format is generally compatible across different
sist2 versions.
The `_index_*.ndjson.zst` files contain the document data in JSON format, stored as a compressed newline-delimited file.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails.
@@ -181,66 +184,6 @@ following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rew
The `.csv` are pre-computed aggregations necessary for the stats page.
*Advanced usage*
Instead of using the `scan` module, you can also import an index generated
by a third party application. The 'external' index must have the following format:
```
my_index/
├── descriptor.json
├── _index_0
└── thumbs/
| ├── data.mdb
| └── lock.mdb
└── meta/
└── <empty>
```
*descriptor.json*:
```json
{
"uuid": "<valid UUID4>",
"version": "_external_v1",
"root": "(optional)",
"name": "<name>",
"rewrite_url": "(optional)",
"type": "json",
"timestamp": 1578971024
}
```
*_index_0*: NDJSON format (One json object per line)
```json
{
"_id": "unique uuid for the file",
"index": "index uuid4 (same one as descriptor.json!)",
"mime": "application/x-cbz",
"size": 14341204,
"mtime": 1578882996,
"extension": "cbz",
"name": "my_book",
"path": "path/to/books",
"content": "text contents of the book",
"title": "Title of the book",
"tag": ["genre.fiction", "author.someguy", "etc..."],
"_keyword": [
{"k": "ISBN", "v": "ABCD34789231"}
],
"_text": [
{"k": "other", "v": "This will be indexed as text"}
]
}
```
You can find the full list of supported fields [here](../src/io/serialize.c#L90)
The `_keyword.*` items will be indexed and searchable as **keyword** fields (only full matches allowed).
The `_text.*` items will be indexed and searchable as **text** fields (fuzzy searching allowed)
*thumbs/*:
LMDB key-value store. Keys are **binary** 16-byte md5 hash* (`_id` field)
@@ -248,9 +191,6 @@ and values are raw image bytes.
*\* Hash is calculated from the full path of the file, including the extension, relative to the index root*
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guaranties of back/forward compatibility.
## Index
### Index options
@@ -276,6 +216,7 @@ it is currently unsupported and has no guaranties of back/forward compatibility.
down the process.
* `-f, --force-reset`
Reset Elasticsearch mappings and settings.
* `-t, --threads` Number of threads to use. Ideally, choose a number equal to the number of logical cores of the machine hosting Elasticsearch.
### Index examples
@@ -305,6 +246,8 @@ sist2 index --print ./my_index/ | jq | less
* `--auth=<str>` Basic auth in user:password format
* `--tag-auth=<str>` Basic auth in user:password format. Works the same way as the
`--auth` argument, but authentication is only applied to the `/tag/` endpoint.
* `--tagline=<str>` When specified, will replace the default tagline in the navbar.
* `--dev` Serve html & js files from disk (for development, used to modify frontend files without having to recompile)
### Web examples
@@ -324,14 +267,19 @@ sist2 web index1 index2 index3 index4
When the `rewrite_url` field is not empty, the web module ignores the `root`
field and will return a HTTP redirect to `<rewrite_url><path>/<name><extension>`
instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
### Link to specific indices
# Elasticsearch
To link to specific indices, you can add a list of comma-separated index name to
the URL: `?i=<name>,<name>`. By default, indices with `"(nsfw)"` in their name are
not displayed.
Elasticsearch versions >=6.8.0, <8.0.0 are supported by sist2.
Using a version >=7.14.0 is recommended to enable the following features:
- Bug fix for large documents (See #198)
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
If you don't care about the features above, you can ignore it or disable it in the configuration page.
## exec-script
@@ -367,7 +315,7 @@ See [scripting](scripting.md) documentation.
# Sidecar files
When scanning, sist2 will read metadata from `.s2meta` JSON files and overwrite the
original document's metadata. Sidecar metadata files will also work inside archives.
original document's indexed metadata (does not modify the actual file). Sidecar metadata files will also work inside archives.
Sidecar files themselves are not saved in the index.
This feature is useful to leverage third-party applications such as speech-to-text or

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.9 KiB

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 889 KiB

After

Width:  |  Height:  |  Size: 1011 KiB

View File

@@ -4,6 +4,10 @@
"type": "keyword",
"doc_values": true
},
"checksum": {
"type": "keyword",
"index": false
},
"_depth": {
"type": "integer"
},
@@ -74,6 +78,7 @@
"name": {
"analyzer": "content_analyzer",
"type": "text",
"fielddata": true,
"fields": {
"nGram": {
"type": "text",

View File

@@ -2,7 +2,8 @@
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
"number_of_replicas": 0,
"highlight.max_analyzed_offset": 10000000
},
"analysis": {
"tokenizer": {

View File

@@ -0,0 +1,58 @@
{
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
},
"analysis": {
"tokenizer": {
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"tag_tokenizer": {
"type": "path_hierarchy",
"delimiter": "."
},
"my_nGram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 3
}
},
"analyzer": {
"path_analyzer": {
"tokenizer": "path_tokenizer",
"filter": [
"lowercase"
]
},
"tag_analyzer": {
"tokenizer": "tag_tokenizer",
"filter": [
"lowercase"
]
},
"case_insensitive_kw_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
},
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
},
"content_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}

View File

@@ -6,5 +6,4 @@ python3 scripts/mime.py > src/parsing/mime_generated.c
python3 scripts/serve_static.py > src/web/static_generated.c
python3 scripts/index_static.py > src/index/static_generated.c
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h
printf "static const char *const LibScanCommitHash = \"%s\";\n" $(cd third-party/libscan/ && git rev-parse HEAD) >> src/git_hash.h
printf "static const char *const Sist2CommitHash = \"%s\";\n" $(git rev-parse HEAD) > src/git_hash.h

View File

@@ -3,6 +3,7 @@ import json
files = [
"schema/mappings.json",
"schema/settings.json",
"schema/settings_legacy.json",
"schema/pipeline.json",
]

View File

@@ -22,6 +22,7 @@ application/java-archive, jar
application/java, class
application/javascript,
application/json, json
application/ndjson, jsonl|ndjson
application/marc, mrc
application/mbedlet, mbd
application/mime, aps
@@ -78,9 +79,7 @@ application/vocaltec-media-desc, vmd
application/vocaltec-media-file, vmf
application/warc, warc
application/winhelp, hlp
application/wordperfect6.0, w60
application/wordperfect6.1, w61
application/wordperfect, wp|wp5|wp6|wpd
application/wordperfect, wp|wp5|wp6|wpd|w60|w61
application/x-123, wk1
application/x-7z-compressed, 7z
application/x-aim, aim
1 application/arj arj
22 application/java class
23 application/javascript
24 application/json json
25 application/ndjson jsonl|ndjson
26 application/marc mrc
27 application/mbedlet mbd
28 application/mime aps
79 application/vocaltec-media-file vmf
80 application/warc warc
81 application/winhelp hlp
82 application/wordperfect6.0 application/wordperfect w60 wp|wp5|wp6|wpd|w60|w61
application/wordperfect6.1 w61
application/wordperfect wp|wp5|wp6|wpd
83 application/x-123 wk1
84 application/x-7z-compressed 7z
85 application/x-aim aim

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -23,7 +23,6 @@
"vue-color": "^2.8.1",
"vue-i18n": "^8.24.4",
"vue-masonry-wall": "^0.3.2",
"vue-multiselect": "^2.1.6",
"vue-router": "^3.2.0",
"vue-simple-suggest": "^1.11.1",
"vuex": "^3.4.0"
@@ -13604,15 +13603,6 @@
"node": ">=10"
}
},
"node_modules/vue-multiselect": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/vue-multiselect/-/vue-multiselect-2.1.6.tgz",
"integrity": "sha512-s7jmZPlm9FeueJg1RwJtnE9KNPtME/7C8uRWSfp9/yEN4M8XcS/d+bddoyVwVnvFyRh9msFo0HWeW0vTL8Qv+w==",
"engines": {
"node": ">= 4.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/vue-observe-visibility": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/vue-observe-visibility/-/vue-observe-visibility-0.4.6.tgz",
@@ -26376,11 +26366,6 @@
"vue-observe-visibility": "^0.4.6"
}
},
"vue-multiselect": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/vue-multiselect/-/vue-multiselect-2.1.6.tgz",
"integrity": "sha512-s7jmZPlm9FeueJg1RwJtnE9KNPtME/7C8uRWSfp9/yEN4M8XcS/d+bddoyVwVnvFyRh9msFo0HWeW0vTL8Qv+w=="
},
"vue-observe-visibility": {
"version": "0.4.6",
"resolved": "https://registry.npmjs.org/vue-observe-visibility/-/vue-observe-visibility-0.4.6.tgz",

View File

@@ -22,7 +22,6 @@
"vue-color": "^2.8.1",
"vue-i18n": "^8.24.4",
"vue-masonry-wall": "^0.3.2",
"vue-multiselect": "^2.1.6",
"vue-router": "^3.2.0",
"vue-simple-suggest": "^1.11.1",
"vuex": "^3.4.0"

View File

@@ -50,6 +50,8 @@ export interface EsHit {
height: number
duration: number
tag: string[]
checksum: string
thumbnail: string
}
_props: {
isSubDocument: boolean
@@ -60,6 +62,8 @@ export interface EsHit {
isPlayableImage: boolean
isAudio: boolean
hasThumbnail: boolean
tnW: number
tnH: number
}
highlight: {
name: string[] | undefined,
@@ -130,6 +134,8 @@ class Sist2Api {
if ("thumbnail" in hit._source) {
hit._props.hasThumbnail = true;
hit._props.tnW = Number(hit._source.thumbnail.split(",")[0]);
hit._props.tnH = Number(hit._source.thumbnail.split(",")[1]);
}
switch (mimeCategory) {
@@ -250,20 +256,31 @@ class Sist2Api {
});
}
getMimeTypes() {
return this.esQuery({
aggs: {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
getMimeTypes(query = undefined) {
const AGGS = {
mimeTypes: {
terms: {
field: "mime",
size: 10000
}
},
size: 0,
}).then(resp => {
}
};
if (!query) {
query = {
aggs: AGGS,
size: 0,
};
} else {
query.size = 0;
query.aggs = AGGS;
}
return this.esQuery(query).then(resp => {
const mimeMap: any[] = [];
resp["aggregations"]["mimeTypes"]["buckets"].sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const buckets = resp["aggregations"]["mimeTypes"]["buckets"];
buckets.sort((a: any, b: any) => a.key > b.key).forEach((bucket: any) => {
const tmp = bucket["key"].split("/");
const category = tmp[0];
const mime = tmp[1];
@@ -283,11 +300,18 @@ class Sist2Api {
});
if (!category_exists) {
mimeMap.push({"text": category, children: [child]});
mimeMap.push({text: category, children: [child], id: category});
}
})
return mimeMap;
mimeMap.forEach(node => {
if (node.children) {
node.children.sort((a, b) => a.id.localeCompare(b.id));
}
})
mimeMap.sort((a, b) => a.id.localeCompare(b.id))
return {buckets, mimeMap};
});
}

View File

@@ -43,6 +43,20 @@ const SORT_MODES = {
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.size
},
nameAsc: {
mode: [
{name: {order: "asc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
},
nameDesc: {
mode: [
{name: {order: "desc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
}
} as any;
@@ -73,6 +87,8 @@ class Sist2Query {
const selectedMimeTypes = getters.selectedMimeTypes;
const selectedTags = getters.selectedTags;
const legacyES = store.state.sist2Info.esVersionLegacy;
const filters = [
{terms: {index: selectedIndexIds}}
] as any[];
@@ -189,6 +205,11 @@ class Sist2Query {
font_name: {},
}
};
if (!legacyES) {
q.highlight.max_analyzed_offset = 9_999_999;
}
if (getters.optSearchInPath) {
q.highlight.fields["path.text"] = {};
q.highlight.fields["path.nGram"] = {};

View File

@@ -5,7 +5,6 @@
<b-card-body>
<!-- TODO: ES connectivity, Link to GH page -->
<b-table :items="tableItems" small borderless responsive="md" thead-class="hidden" class="mb-0"></b-table>
<hr />
@@ -16,7 +15,7 @@
<script>
import IndexDebugInfo from "@/components/IndexDebugInfo";
import DebugIcon from "@/components/DebugIcon";
import DebugIcon from "@/components/icons/DebugIcon";
export default {
name: "DebugInfo.vue",
@@ -28,10 +27,12 @@ export default {
{key: "platform", value: this.$store.state.sist2Info.platform},
{key: "debugBinary", value: this.$store.state.sist2Info.debug},
{key: "sist2CommitHash", value: this.$store.state.sist2Info.sist2Hash},
{key: "libscanCommitHash", value: this.$store.state.sist2Info.libscanHash},
{key: "esIndex", value: this.$store.state.sist2Info.esIndex},
{key: "tagline", value: this.$store.state.sist2Info.tagline},
{key: "dev", value: this.$store.state.sist2Info.dev},
{key: "esVersion", value: this.$store.state.sist2Info.esVersion},
{key: "esVersionSupported", value: this.$store.state.sist2Info.esVersionSupported},
{key: "esVersionLegacy", value: this.$store.state.sist2Info.esVersionLegacy},
]
}
}

View File

@@ -15,11 +15,15 @@
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
<div v-if="doc._props.isImage && !hover" class="card-img-overlay" :class="{'small-badge': smallBadge}">
<div
v-if="doc._props.isImage && !hover && doc._props.tnW / doc._props.tnH < 5"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ `${doc._source.width}x${doc._source.height}` }}</span>
</div>
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover" class="card-img-overlay"
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
@@ -30,16 +34,19 @@
</svg>
</div>
<img v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
<img ref="tn"
v-if="doc._props.isPlayableImage || doc._props.isPlayableVideo"
:src="(doc._props.isGif && hover) ? `f/${doc._id}` : `t/${doc._source.index}/${doc._id}`"
alt=""
:style="{height: (doc._props.isGif && hover) ? `${tnHeight()}px` : undefined}"
class="pointer fit card-img-top" @click="onThumbnailClick()">
<img v-else :src="`t/${doc._source.index}/${doc._id}`" alt=""
class="fit card-img-top">
</div>
<!-- Audio player-->
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls :type="doc._source.mime"
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls
:type="doc._source.mime"
:src="`f/${doc._id}`"
@play="onAudioPlay()"></audio>
@@ -117,6 +124,9 @@ export default {
},
onTnLeave() {
this.hover = false;
},
// Current height of the thumbnail <img ref="tn">. The template uses it to pin
// the element's height (in px) while a hovered GIF is swapped to its full file.
tnHeight() {
  const thumbnail = this.$refs.tn;
  return thumbnail.height;
}
},
}

View File

@@ -4,7 +4,8 @@
<template #modal-title>
<h5 class="modal-title" :title="doc._source.name + ext(doc)">{{ doc._source.name + ext(doc) }}</h5>
</template>
<img :src="`t/${doc._source.index}/${doc._id}`" alt="" class="fit card-img-top">
<img v-if="doc._props.hasThumbnail" :src="`t/${doc._source.index}/${doc._id}`" alt="" class="fit card-img-top">
<InfoTable :doc="doc"></InfoTable>

View File

@@ -1,5 +1,6 @@
<template>
<b-list-group-item class="flex-column align-items-start mb-2">
<b-list-group-item class="flex-column align-items-start mb-2" :class="{'sub-document': doc._props.isSubDocument}"
@mouseenter="onTnEnter()" @mouseleave="onTnLeave()" >
<!-- Info modal-->
<DocInfoModal :show="showInfo" :doc="doc" @close="showInfo = false"></DocInfoModal>
@@ -40,9 +41,11 @@
</div>
<div v-if="doc._source.pages || doc._source.author" class="path-row text-muted">
<span v-if="doc._source.pages">{{ doc._source.pages }} {{ doc._source.pages > 1 ? $t("pages") : $t("page") }}</span>
<span v-if="doc._source.pages">{{ doc._source.pages }} {{
doc._source.pages > 1 ? $t("pages") : $t("page")
}}</span>
<span v-if="doc._source.author && doc._source.pages" class="mx-1">-</span>
<span v-if="doc._source.author">{{doc._source.author}}</span>
<span v-if="doc._source.author">{{ doc._source.author }}</span>
</div>
</div>
</div>
@@ -54,7 +57,7 @@ import TagContainer from "@/components/TagContainer";
import DocFileTitle from "@/components/DocFileTitle";
import DocInfoModal from "@/components/DocInfoModal";
import ContentDiv from "@/components/ContentDiv";
import FileIcon from "@/components/FileIcon";
import FileIcon from "@/components/icons/FileIcon";
export default {
name: "DocListItem",
@@ -83,12 +86,26 @@ export default {
return this.doc.highlight["path.nGram"] + "/"
}
return this.doc._source.path + "/"
}
},
// Bound to @mouseenter on the root list item: flags the item as hovered.
onTnEnter() {
this.hover = true;
},
// Bound to @mouseleave on the root list item: clears the hovered flag.
onTnLeave() {
this.hover = false;
},
}
}
</script>
<style scoped>
.sub-document {
background: #AB47BC1F !important;
}
.theme-black .sub-document {
background: #37474F !important;
}
.list-group {
margin-top: 1em;
}

View File

@@ -1,93 +1,171 @@
<template>
<VueMultiselect
multiple
label="name"
:value="selectedIndices"
:options="indices"
:close-on-select="indices.length <= 1"
:placeholder="$t('indexPickerPlaceholder')"
@select="addItem"
@remove="removeItem">
<div v-if="isMobile">
<b-form-select
:value="selectedIndicesIds"
@change="onSelect($event)"
:options="indices" multiple :select-size="6" text-field="name"
value-field="id"></b-form-select>
</div>
<div v-else>
<template slot="option" slot-scope="idx">
<b-row>
<b-col>
<span class="mr-1">{{ idx.option.name }}</span>
<SmallBadge pill :text="idx.option.version"></SmallBadge>
</b-col>
</b-row>
<b-row class="mt-1">
<b-col>
<span>{{ formatIdxDate(idx.option.timestamp) }}</span>
</b-col>
</b-row>
</template>
<div class="d-flex justify-content-between align-content-center">
<span>
{{ selectedIndices.length }}
{{ selectedIndices.length === 1 ? $t("indexPicker.selectedIndex") : $t("indexPicker.selectedIndices") }}
</span>
</VueMultiselect>
<div>
<b-button variant="link" @click="selectAll()"> {{ $t("indexPicker.selectAll") }}</b-button>
<b-button variant="link" @click="selectNone()"> {{ $t("indexPicker.selectNone") }}</b-button>
</div>
</div>
<b-list-group id="index-picker-desktop" class="unselectable">
<b-list-group-item
v-for="idx in indices"
@click="toggleIndex(idx, $event)"
@click.shift="shiftClick(idx, $event)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer"
:class="{active: lastClickIndex === idx}"
>
<div class="d-flex">
<b-checkbox @change="toggleIndex(idx)" :checked="isSelected(idx)"></b-checkbox>
{{ idx.name }}
<span class="text-muted timestamp-text ml-2">{{ formatIdxDate(idx.timestamp) }}</span>
</div>
<b-badge class="version-badge">v{{ idx.version }}</b-badge>
</b-list-group-item>
</b-list-group>
</div>
</template>
<script lang="ts">
import VueMultiselect from "vue-multiselect"
import SmallBadge from "./SmallBadge.vue"
import {mapActions, mapGetters} from "vuex";
import {Index} from "@/Sist2Api";
import Vue from "vue";
import {format} from "date-fns";
export default Vue.extend({
components: {
VueMultiselect,
SmallBadge
},
data() {
return {
loading: true
loading: true,
lastClickIndex: null
}
},
computed: {
...mapGetters([
"indices", "selectedIndices"
]),
// Ids of the selected indices; fed to the mobile <b-form-select> as :value
// (that select uses value-field="id").
selectedIndicesIds() {
  return this.selectedIndices.map(({id}) => id)
},
// True when the viewport is at most 650px wide; switches the template to the
// multi-select form control instead of the list group.
// NOTE(review): window.innerWidth is not a reactive dependency, so this computed
// value is presumably evaluated once and then cached - confirm that resizing the
// window is not expected to switch layouts.
isMobile() {
  const viewportWidth = window.innerWidth;
  return viewportWidth <= 650;
}
},
methods: {
...mapActions({
setSelectedIndices: "setSelectedIndices"
}),
removeItem(val: Index): void {
this.setSelectedIndices(this.selectedIndices.filter((item: Index) => item !== val))
// Shift-click range selection.
//
// The last plainly clicked row (lastClickIndex) is the anchor: every row between
// the anchor and the shift-clicked row (both included) is given the anchor's
// current state - selected if the anchor is selected, deselected otherwise.
//
// index: the row that was shift-clicked.
// e:     the click event (unused, kept for the template binding).
//
// The new selection is computed once and dispatched once, instead of dispatching
// setSelectedIndices for every row in the range. Rows are matched by id, like
// toggleIndex() and isSelected() do.
shiftClick(index, e) {
  if (this.lastClickIndex === null) {
    return;
  }

  const anchorPos = this.indices.indexOf(this.lastClickIndex);
  const targetPos = this.indices.indexOf(index);
  if (anchorPos === -1 || targetPos === -1) {
    // Anchor or target is not part of the current index list: no range to apply.
    return;
  }

  const leftBoundary = Math.min(anchorPos, targetPos);
  const rightBoundary = Math.max(anchorPos, targetPos);
  const range = this.indices.slice(leftBoundary, rightBoundary + 1);

  if (this.isSelected(this.lastClickIndex)) {
    // Prepend the rows that are not selected yet, last row first, which is the
    // order produced by prepending them one at a time.
    const missing = range.filter(idx => !this.isSelected(idx)).reverse();
    if (missing.length > 0) {
      this.setSelectedIndices([...missing, ...this.selectedIndices]);
    }
  } else {
    const inRange = (candidate) => range.some(idx => idx.id == candidate.id);
    this.setSelectedIndices(this.selectedIndices.filter(idx => !inRange(idx)));
  }
},
addItem(val: Index): void {
this.setSelectedIndices([...this.selectedIndices, val])
// "Select All" button: select every known index.
selectAll() {
  const everyIndex = this.indices;
  this.setSelectedIndices(everyIndex);
},
// "Select None" button: clear the selection.
selectNone() {
  this.setSelectedIndices([]);
},
// Change handler of the mobile <b-form-select>. `value` is the list of selected
// ids; it is mapped back to the matching index objects.
onSelect(value) {
  const picked = this.indices.filter(({id}) => value.includes(id));
  this.setSelectedIndices(picked);
},
// Formats an index timestamp as yyyy-MM-dd. The timestamp is multiplied by 1000
// before being handed to Date, i.e. it is treated as seconds.
formatIdxDate(timestamp: number): string {
  const asDate = new Date(timestamp * 1000);
  return format(asDate, "yyyy-MM-dd");
},
// Plain click on a row: remember it as the anchor for shift-click and flip its
// selection state. Shift-clicks are ignored here (shiftClick() handles them).
//
// NOTE(review): the checkbox binding `@change="toggleIndex(idx)"` passes no
// event, so `e` is undefined on that path and reading `e.shiftKey` would throw.
// Confirm whether that binding is intended before adding a guard - the click on
// the checkbox presumably also reaches the row's own @click handler.
toggleIndex(index, e) {
  if (e.shiftKey) {
    return;
  }

  this.lastClickIndex = index;

  const nextSelection = this.isSelected(index)
      ? this.selectedIndices.filter(idx => idx.id != index.id)
      : [index, ...this.selectedIndices];
  this.setSelectedIndices(nextSelection);
},
// True when an index with the same id is part of the current selection.
isSelected(index) {
  return this.selectedIndices.some(idx => idx.id == index.id);
}
},
})
</script>
<style src="vue-multiselect/dist/vue-multiselect.min.css"></style>
<style>
.multiselect__option {
padding: 5px 10px;
<style scoped>
.timestamp-text {
line-height: 24px;
font-size: 80%;
}
.multiselect__content-wrapper {
overflow: hidden;
.theme-black .version-badge {
color: #eee !important;
background: none;
}
.theme-black .multiselect__tags {
background: #37474F;
border: 1px solid #616161 !important
.version-badge {
color: #222 !important;
background: none;
}
.theme-black .multiselect__input {
color: #dbdbdb;
background: #37474F;
.list-group-item {
padding: 0.2em 0.4em;
}
.theme-black .multiselect__content-wrapper {
border: none
#index-picker-desktop {
overflow-y: auto;
max-height: 132px;
}
.btn-link:focus {
box-shadow: none;
}
.unselectable {
user-select: none;
-ms-user-select: none;
-moz-user-select: none;
-webkit-user-select: none;
}
.list-group-item.active {
z-index: 2;
background-color: inherit;
color: inherit;
}
</style>

View File

@@ -3,7 +3,7 @@
<template #cell(value)="data">
<span v-if="'html' in data.item" v-html="data.item.html"></span>
<span v-else>{{data.value}}</span>
<span v-else>{{ data.value }}</span>
</template>
</b-table>
</template>
@@ -57,7 +57,8 @@ export default {
"bitrate", "artist", "album", "album_artist", "genre", "font_name", "author",
"modified_by", "pages", "tag",
"exif_make", "exif_software", "exif_exposure_time", "exif_fnumber", "exif_focal_length",
"exif_user_comment", "exif_iso_speed_ratings", "exif_model", "exif_datetime",
"exif_user_comment", "exif_iso_speed_ratings", "exif_model", "exif_datetime",
"checksum"
];
fields.forEach(field => {
@@ -76,9 +77,9 @@ export default {
items.push({
key: "Exif GPS",
html: makeGpsLink(
dmsToDecimal(src["exif_gps_latitude_dms"], src["exif_gps_latitude_ref"]),
dmsToDecimal(src["exif_gps_longitude_dms"], src["exif_gps_longitude_ref"]),
),
dmsToDecimal(src["exif_gps_latitude_dms"], src["exif_gps_latitude_ref"]),
dmsToDecimal(src["exif_gps_longitude_dms"], src["exif_gps_longitude_ref"]),
),
});
}

View File

@@ -1,6 +1,5 @@
<template>
<div>
<!-- TODO: Set slideshowTime as a configurable option-->
<FsLightbox
:key="lightboxKey"
:toggler="showLightbox"
@@ -10,7 +9,7 @@
:types="lightboxTypes"
:source-index="lightboxSlide"
:custom-toolbar-buttons="customButtons"
:slideshow-time="1000 * 10"
:slideshow-time="$store.getters.optLightboxSlideDuration * 1000"
:zoom-increment="0.5"
:load-only-current-source="$store.getters.optLightboxLoadOnlyCurrent"
:on-close="onClose"

View File

@@ -7,37 +7,24 @@ import InspireTree from "inspire-tree";
import InspireTreeDOM from "inspire-tree-dom";
import "inspire-tree-dom/dist/inspire-tree-light.min.css";
import {getSelectedTreeNodes} from "@/util";
import {getSelectedTreeNodes, getTreeNodeAttributes} from "@/util";
import Sist2Api from "@/Sist2Api";
import Sist2Query from "@/Sist2Query";
export default {
name: "MimePicker",
data() {
return {
mimeTree: null,
stashedMimeTreeAttributes: null
}
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
const mimeMap = mutation.payload.slice();
this.mimeTree = new InspireTree({
selection: {
mode: 'checkbox'
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: '#mimeTree'
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
if (mutation.type === "setUiMimeMap" && this.mimeTree === null) {
this.initializeTree();
} else if (mutation.type === "busSearch") {
this.updateTree();
}
});
},
@@ -49,6 +36,73 @@ export default {
this.$store.commit("setSelectedMimeTypes", getSelectedTreeNodes(this.mimeTree));
},
// Refreshes the media type tree from the current search query.
//
// Does nothing when the optUpdateMimeMap option is off. Otherwise the tree's
// node attributes are stashed, the mime aggregation is fetched for the current
// query, the store (uiMimeMap, uiDetailsMimeAgg) and the tree nodes are replaced,
// and the stashed checked/expanded state is re-applied to the nodes that still
// exist.
//
// NOTE(review): this.mimeTree is used without a null check - presumably
// "busSearch" is never committed before the tree is initialized; confirm.
updateTree() {
if (this.$store.getters.optUpdateMimeMap === false) {
return;
}
// Stash only if nothing is stashed yet, so a request still in flight keeps the
// attributes captured before the tree was emptied.
if (this.stashedMimeTreeAttributes === null) {
this.stashedMimeTreeAttributes = getTreeNodeAttributes(this.mimeTree);
}
const query = Sist2Query.searchQuery();
Sist2Api.getMimeTypes(query).then(({buckets, mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.$store.commit("setUiDetailsMimeAgg", buckets);
// Replace the whole tree content with the freshly aggregated mime map
this.mimeTree.removeAll();
this.mimeTree.addNodes(mimeMap);
// Restore selected mimes
if (this.stashedMimeTreeAttributes === null) {
// NOTE: This happens when successive fast searches are triggered
this.stashedMimeTreeAttributes = {};
// Always add the selected mime types
this.$store.state.selectedMimeTypes.forEach(mime => {
this.stashedMimeTreeAttributes[mime] = {
checked: true
}
});
}
// Entries are keyed by mime id; ids no longer present in the new tree are skipped
Object.entries(this.stashedMimeTreeAttributes).forEach(([mime, attributes]) => {
if (this.mimeTree.node(mime)) {
if (attributes.checked) {
this.mimeTree.node(mime).select();
}
if (attributes.collapsed === false) {
this.mimeTree.node(mime).expand();
}
}
});
// Stash consumed; the next update captures a fresh one
this.stashedMimeTreeAttributes = null;
});
},
initializeTree() {
const mimeMap = this.$store.state.uiMimeMap;
this.mimeTree = new InspireTree({
selection: {
mode: "checkbox"
},
data: mimeMap
});
new InspireTreeDOM(this.mimeTree, {
target: "#mimeTree"
});
this.mimeTree.on("node.state.changed", this.handleTreeClick);
this.mimeTree.deselect();
if (this.$store.state._onLoadSelectedMimeTypes.length > 0) {
this.$store.state._onLoadSelectedMimeTypes.forEach(mime => {
this.mimeTree.node(mime).select();
});
}
}
}
}
</script>

View File

@@ -8,7 +8,8 @@
</b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span><span v-if="isLegacy() && !hideLegacy()">-<a
href="https://github.com/simon987/sist2/blob/master/docs/USAGE.md#elasticsearch" target="_blank">legacyES</a></span>
</span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>
@@ -19,7 +20,8 @@
</template>
<script>
import Sist2Icon from "@/components/Sist2Icon";
import Sist2Icon from "@/components/icons/Sist2Icon";
export default {
name: "NavBar",
components: {Sist2Icon},
@@ -32,6 +34,12 @@ export default {
},
isDebug() {
return this.$store.state.sist2Info.debug;
},
isLegacy() {
return this.$store.state.sist2Info.esVersionLegacy;
},
hideLegacy() {
return this.$store.state.optHideLegacy;
}
}
}
@@ -95,7 +103,7 @@ export default {
}
}
.theme-light .btn-link{
.theme-light .btn-link {
color: #222;
}
</style>

View File

@@ -3,31 +3,56 @@
<span>{{ hitCount }} {{ hitCount === 1 ? $t("hit") : $t("hits") }}</span>
<div style="float: right">
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile">{{ $t("details") }}</b-button>
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile" @click="onToggle()">{{
$t("details")
}}
</b-button>
<SortSelect class="ml-2"></SortSelect>
<template v-if="hitCount !== 0">
<SortSelect class="ml-2"></SortSelect>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
</template>
</div>
<b-collapse id="collapse-1" class="pt-2" style="clear:both;">
<b-card>
<b-table :items="tableItems" small borderless thead-class="hidden" class="mb-0"></b-table>
<b-table :items="tableItems" small borderless bordered thead-class="hidden" class="mb-0"></b-table>
<br/>
<h4>
{{$t("mimeTypes")}}
<b-button size="sm" variant="primary" class="float-right" @click="onCopyClick"><ClipboardIcon/></b-button>
</h4>
<Preloader v-if="$store.state.uiDetailsMimeAgg == null"></Preloader>
<b-table
v-else
sort-by="doc_count"
:sort-desc="true"
thead-class="hidden"
:items="$store.state.uiDetailsMimeAgg" small bordered class="mb-0"
></b-table>
</b-card>
</b-collapse>
</b-card>
</template>
<script lang="ts">
import {EsResult} from "@/Sist2Api";
import Sist2Api, {EsResult} from "@/Sist2Api";
import Vue from "vue";
import {humanFileSize, humanTime} from "@/util";
import {humanFileSize} from "@/util";
import DisplayModeToggle from "@/components/DisplayModeToggle.vue";
import SortSelect from "@/components/SortSelect.vue";
import Preloader from "@/components/Preloader.vue";
import Sist2Query from "@/Sist2Query";
import ClipboardIcon from "@/components/icons/ClipboardIcon.vue";
export default Vue.extend({
name: "ResultsCard",
components: {SortSelect, DisplayModeToggle},
components: {ClipboardIcon, Preloader, SortSelect, DisplayModeToggle},
created() {
},
computed: {
lastResultsLoaded() {
return this.$store.state.lastQueryResults != null;
@@ -52,6 +77,39 @@ export default Vue.extend({
totalSize() {
return humanFileSize((this.$store.state.lastQueryResults as EsResult).aggregations.total_size.value);
},
// Click handler of the "details" button. Records in the store whether the
// details panel is being opened (the panel does not carry the "show" class at
// the time of the click) and, when it opens with no mime aggregation loaded and
// automatic updates turned off, fetches the aggregation on demand.
onToggle() {
  const panel = document.getElementById("collapse-1");
  const show = !panel.classList.contains("show");
  this.$store.commit("setUiShowDetails", show);

  if (!show) {
    return;
  }

  const state = this.$store.state;
  const aggMissing = state.uiDetailsMimeAgg == null;
  if (aggMissing && !state.optUpdateMimeMap) {
    // Mime aggs are not updated automatically, update now
    this.forceUpdateMimeAgg();
  }
},
// Copies the mime type aggregation to the clipboard as TSV (one
// "key<TAB>doc_count" line per bucket, highest count first), then shows the
// "copied" toast.
// NOTE(review): the result of writeText() is not awaited, so the toast is shown
// whether or not the copy succeeded.
onCopyClick() {
  const byCountDesc = (a, b) => b["doc_count"] - a["doc_count"];
  const rows = this.$store.state.uiDetailsMimeAgg.slice().sort(byCountDesc);
  const tsvString = rows
      .map(row => `${row["key"]}\t${row["doc_count"]}\n`)
      .join("");

  navigator.clipboard.writeText(tsvString);

  const toastOptions = {
    title: null,
    noAutoHide: false,
    toaster: "b-toaster-bottom-right",
    headerClass: "hidden",
    bodyClass: "toast-body-info",
  };
  this.$bvToast.toast(this.$t("toast.copiedToClipboard"), toastOptions);
},
// Fetches the mime type aggregation for the current search query and stores its
// buckets as uiDetailsMimeAgg.
forceUpdateMimeAgg() {
  Sist2Api.getMimeTypes(Sist2Query.searchQuery()).then(result => {
    this.$store.commit("setUiDetailsMimeAgg", result.buckets);
  });
}
},
});

View File

@@ -19,6 +19,14 @@
{{ $t("sort.sizeDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameDesc'}" @click="onSelect('nameDesc')">
{{ $t("sort.nameDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameAsc'}" @click="onSelect('nameAsc')">
{{ $t("sort.nameAsc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'random'}" @click="onSelect('random')">
{{ $t("sort.random") }}
</b-dropdown-item>

View File

@@ -51,7 +51,7 @@
>{{ tag.text.split(".").pop() }}</span>
<b-popover :target="hit._id+tag.rawText" triggers="focus blur" placement="top">
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">Delete</b-button>
<b-button variant="danger" @click="onTagDeleteClick(tag, $event)">{{$t("deleteTag")}}</b-button>
</b-popover>
</div>
@@ -63,7 +63,7 @@
</template>
<!-- Add button -->
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">Add</small>
<small v-if="showAddButton" class="badge add-tag-button" @click="tagAdd()">{{$t("addTag")}}</small>
<!-- Size tag-->
<small v-else class="text-muted badge-size">{{

View File

@@ -120,7 +120,7 @@ export default {
},
mounted() {
this.$store.subscribe((mutation) => {
if (mutation.type === "setUiMimeMap") {
if (mutation.type === "setUiMimeMap" && this.tagTree === null) {
this.initializeTree();
this.updateTree();
} else if (mutation.type === "busUpdateTags") {
@@ -147,6 +147,7 @@ export default {
this.tagTree.on("node.state.changed", this.handleTreeClick);
},
updateTree() {
// TODO: remember which tags are selected and restore?
const tagMap = [];
Sist2Api.getTags().then(tags => {
tags.forEach(tag => addTag(tagMap, tag.id, tag.id, tag.count));

View File

@@ -0,0 +1,21 @@
<template>
  <svg width="24" height="24" viewBox="0 0 24 24">
    <!-- The intrinsic size is given as attributes rather than an inline style:
         an inline style takes precedence over this component's scoped `svg`
         rule, which would leave that rule without effect. -->
    <path
        fill="currentColor"
        d="M17,9H7V7H17M17,13H7V11H17M14,17H7V15H14M12,3A1,1 0 0,1 13,4A1,1 0 0,1 12,5A1,1 0 0,1 11,4A1,1 0 0,1 12,3M19,3H14.82C14.4,1.84 13.3,1 12,1C10.7,1 9.6,1.84 9.18,3H5A2,2 0 0,0 3,5V19A2,2 0 0,0 5,21H19A2,2 0 0,0 21,19V5A2,2 0 0,0 19,3Z"/>
  </svg>
</template>
<script>
export default {
name: "ClipboardIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -0,0 +1,21 @@
<template>
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24">
<path
fill="currentColor"
d="M12 0c-6.627 0-12 5.373-12 12s5.373 12 12 12 12-5.373 12-12-5.373-12-12-12zm1 16.057v-3.057h2.994c-.059 1.143-.212 2.24-.456 3.279-.823-.12-1.674-.188-2.538-.222zm1.957 2.162c-.499 1.33-1.159 2.497-1.957 3.456v-3.62c.666.028 1.319.081 1.957.164zm-1.957-7.219v-3.015c.868-.034 1.721-.103 2.548-.224.238 1.027.389 2.111.446 3.239h-2.994zm0-5.014v-3.661c.806.969 1.471 2.15 1.971 3.496-.642.084-1.3.137-1.971.165zm2.703-3.267c1.237.496 2.354 1.228 3.29 2.146-.642.234-1.311.442-2.019.607-.344-.992-.775-1.91-1.271-2.753zm-7.241 13.56c-.244-1.039-.398-2.136-.456-3.279h2.994v3.057c-.865.034-1.714.102-2.538.222zm2.538 1.776v3.62c-.798-.959-1.458-2.126-1.957-3.456.638-.083 1.291-.136 1.957-.164zm-2.994-7.055c.057-1.128.207-2.212.446-3.239.827.121 1.68.19 2.548.224v3.015h-2.994zm1.024-5.179c.5-1.346 1.165-2.527 1.97-3.496v3.661c-.671-.028-1.329-.081-1.97-.165zm-2.005-.35c-.708-.165-1.377-.373-2.018-.607.937-.918 2.053-1.65 3.29-2.146-.496.844-.927 1.762-1.272 2.753zm-.549 1.918c-.264 1.151-.434 2.36-.492 3.611h-3.933c.165-1.658.739-3.197 1.617-4.518.88.361 1.816.67 2.808.907zm.009 9.262c-.988.236-1.92.542-2.797.9-.89-1.328-1.471-2.879-1.637-4.551h3.934c.058 1.265.231 2.488.5 3.651zm.553 1.917c.342.976.768 1.881 1.257 2.712-1.223-.49-2.326-1.211-3.256-2.115.636-.229 1.299-.435 1.999-.597zm9.924 0c.7.163 1.362.367 1.999.597-.931.903-2.034 1.625-3.257 2.116.489-.832.915-1.737 1.258-2.713zm.553-1.917c.27-1.163.442-2.386.501-3.651h3.934c-.167 1.672-.748 3.223-1.638 4.551-.877-.358-1.81-.664-2.797-.9zm.501-5.651c-.058-1.251-.229-2.46-.492-3.611.992-.237 1.929-.546 2.809-.907.877 1.321 1.451 2.86 1.616 4.518h-3.933z"/>
</svg>
</template>
<script>
// Presentational SVG icon component: it only declares a name and has no props,
// data or methods. Sizing comes from the scoped `svg` rule of this file.
export default {
name: "LanguageIcon"
}
</script>
<style scoped>
svg {
display: inline-block;
width: 20px;
height: 20px;
}
</style>

View File

@@ -5,6 +5,8 @@ export default {
advanced: "Advanced search",
fuzzy: "Fuzzy"
},
addTag: "Add",
deleteTag: "Delete",
download: "Download",
and: "and",
page: "page",
@@ -62,7 +64,10 @@ export default {
lightboxLoadOnlyCurrent: "Do not preload full-size images for adjacent slides in image viewer.",
slideDuration: "Slide duration",
resultSize: "Number of results per page",
tagOrOperator: "Use OR operator when specifying multiple tags."
tagOrOperator: "Use OR operator when specifying multiple tags.",
hideDuplicates: "Hide duplicate results based on checksum",
hideLegacy: "Hide the 'legacyES' Elasticsearch notice",
updateMimeMap: "Update the Media Types tree in real time"
},
queryMode: {
simple: "Simple",
@@ -70,7 +75,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grid",
@@ -124,18 +130,21 @@ export default {
esQueryErr: "Could not parse or execute query, please check the Advanced search documentation. " +
"See server logs for more information.",
dupeTagTitle: "Duplicate tag",
dupeTag: "This tag already exists for this document."
dupeTag: "This tag already exists for this document.",
copiedToClipboard: "Copied to clipboard"
},
saveTagModalTitle: "Add tag",
saveTagPlaceholder: "Tag name",
confirm: "Confirm",
indexPickerPlaceholder: "Select indices",
indexPickerPlaceholder: "Select an index",
sort: {
relevance: "Relevance",
dateAsc: "Date (Older first)",
dateDesc: "Date (Newer first)",
sizeAsc: "Size (Smaller first)",
sizeDesc: "Size (Larger first)",
nameAsc: "Name (A-z)",
nameDesc: "Name (Z-a)",
random: "Random",
},
d3: {
@@ -143,7 +152,13 @@ export default {
mimeSize: "Size distribution by media type",
dateHistogram: "File modification time distribution",
sizeHistogram: "File size distribution",
}
},
indexPicker: {
selectNone: "Select None",
selectAll: "Select All",
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
},
fr: {
searchBar: {
@@ -151,6 +166,8 @@ export default {
advanced: "Recherche avancée",
fuzzy: "Approximatif"
},
addTag: "Ajouter",
deleteTag: "Supprimer",
download: "Télécharger",
and: "et",
page: "page",
@@ -209,7 +226,10 @@ export default {
lightboxLoadOnlyCurrent: "Désactiver le chargement des diapositives adjacentes pour le visualiseur d'images",
slideDuration: "Durée des diapositives",
resultSize: "Nombre de résultats par page",
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags"
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags",
hideDuplicates: "Masquer les résultats en double",
hideLegacy: "Masquer la notice 'legacyES' Elasticsearch",
updateMimeMap: "Mettre à jour l'arbre de Types de médias en temps réel"
},
queryMode: {
simple: "Simple",
@@ -217,7 +237,8 @@ export default {
},
lang: {
en: "English",
fr: "Français"
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "Grille",
@@ -272,7 +293,8 @@ export default {
esQueryErr: "Impossible d'analyser ou d'exécuter la requête, veuillez consulter la documentation sur la " +
"recherche avancée. Voir les journaux du serveur pour plus d'informations.",
dupeTagTitle: "Tag en double",
dupeTag: "Ce tag existe déjà pour ce document."
dupeTag: "Ce tag existe déjà pour ce document.",
copiedToClipboard: "Copié dans le presse-papier"
},
saveTagModalTitle: "Ajouter un tag",
saveTagPlaceholder: "Nom du tag",
@@ -284,6 +306,8 @@ export default {
dateDesc: "Date (Plus récent)",
sizeAsc: "Taille (Plus petit)",
sizeDesc: "Taille (Plus grand)",
nameAsc: "Nom (A-z)",
nameDesc: "Nom (Z-a)",
random: "Aléatoire",
},
d3: {
@@ -291,6 +315,173 @@ export default {
mimeSize: "Distribution des tailles de fichiers par type de média",
dateHistogram: "Distribution des dates de modification",
sizeHistogram: "Distribution des tailles de fichier",
}
}
}
},
indexPicker: {
selectNone: "Sélectionner aucun",
selectAll: "Sélectionner tout",
selectedIndex: "indice sélectionné",
selectedIndices: "indices sélectionnés",
},
},
"zh-CN": {
searchBar: {
simple: "搜索",
advanced: "高级搜索",
fuzzy: "模糊搜索"
},
addTag: "添加",
deleteTag: "删除",
download: "下载",
and: "与",
page: "页",
pages: "页",
mimeTypes: "文件类型",
tags: "标签",
help: {
simpleSearch: "简易搜索",
advancedSearch: "高级搜索",
help: "帮助",
term: "<关键词>",
and: "与操作",
or: "或操作",
not: "反选单个关键词",
quotes: "括起来的部分视为一个关键词,保序",
prefix: "在词尾使用时,匹配前缀",
parens: "表达式编组",
tildeTerm: "匹配编辑距离以内的关键词",
tildePhrase: "匹配短语,容忍一些非匹配词",
example1:
"例如: <code>\"番茄\" +(炒蛋 | 牛腩) -饭</code> 将匹配" +
"短语 <i>番茄炒蛋</i>、<i>炒蛋</i> 或者 <i>牛腩</i>,而忽略任何带有" +
"<i>饭</i>的关键词.",
defaultOperator:
"表达式中无<code>+</code>或者<code>|</code>时,默认使用" +
"<code>+</code>(与操作)。",
// Help text for the fuzzy search option (rendered as HTML).
fuzzy:
"选中<b>模糊搜索</b>选项时返回部分匹配的结果(3-grams)。",
moreInfoSimple: "详细信息:<a target=\"_blank\" " +
"rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html\">Elasticsearch文档</a>",
moreInfoAdvanced: "高级搜索模式文档:<a target=\"_blank\" rel=\"noreferrer\" href=\"//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax\">Elasticsearch文档</a>"
},
config: "配置",
configDescription: "配置在此浏览器中实时保存。",
configReset: "重置所有设置",
searchOptions: "搜索选项",
treemapOptions: "树状图选项",
displayOptions: "显示选项",
opt: {
lang: "语言",
highlight: "启用高亮",
fuzzy: "默认使用模糊搜索",
searchInPath: "匹配文档路径",
suggestPath: "搜索框启用自动补全",
fragmentSize: "高亮上下文大小",
queryMode: "搜索模式",
displayMode: "显示",
columns: "列数",
treemapType: "树状图类属性",
treemapTiling: "树状图平铺",
treemapColorGroupingDepth: "树状图颜色编组深度(展开)",
treemapColor: "树状图颜色(折叠)",
treemapSize: "树状图大小",
theme: "主题",
lightboxLoadOnlyCurrent: "在图片查看器中,不要预读相邻的全图",
slideDuration: "幻灯片时长",
resultSize: "每页结果数",
tagOrOperator: "使用或操作OR匹配多个标签。",
hideDuplicates: "使用校验码隐藏重复结果",
hideLegacy: "隐藏'legacyES' Elasticsearch 通知",
updateMimeMap: "媒体类型树的实时更新"
},
queryMode: {
simple: "简单",
advanced: "高级",
},
lang: {
en: "English",
fr: "Français",
"zh-CN": "简体中文",
},
displayMode: {
grid: "网格",
list: "列表",
},
columns: {
auto: "自动"
},
treemapType: {
cascaded: "折叠",
flat: "平铺(紧凑)"
},
treemapSize: {
small: "小",
medium: "中",
large: "大",
xLarge: "加大",
xxLarge: "加加大",
custom: "自订",
},
treemapTiling: {
binary: "Binary",
squarify: "Squarify",
slice: "Slice",
dice: "Dice",
sliceDice: "Slice & Dice",
},
theme: {
light: "亮",
black: "暗"
},
hit: "命中",
hits: "命中",
details: "详细信息",
stats: "统计信息",
queryTime: "查询时间",
totalSize: "总大小",
pathBar: {
placeholder: "过滤路径",
modalTitle: "选择路径"
},
debug: "调试信息",
debugDescription: "对调试除错有用的信息。 若您遇到bug或者想建议新功能请提交新Issue到" +
"<a href='https://github.com/simon987/sist2/issues/new/choose'>这里</a>.",
tagline: "标签栏",
toast: {
esConnErrTitle: "Elasticsearch连接错误",
esConnErr: "sist2 web 模块连接Elasticsearch出错。" +
"查看服务日志以获取更多信息。",
esQueryErrTitle: "查询错误",
esQueryErr: "无法识别或执行查询,请查阅高级搜索文档。" +
"查看服务日志以获取更多信息。",
dupeTagTitle: "重复标签",
dupeTag: "该标签已存在于此文档。",
copiedToClipboard: "复制到剪贴板"
},
saveTagModalTitle: "增加标签",
saveTagPlaceholder: "标签名",
confirm: "确认",
indexPickerPlaceholder: "选择一个索引",
// Labels of the sort mode dropdown; the ordering hint goes in parentheses.
sort: {
relevance: "相关度",
dateAsc: "日期(由旧到新)",
dateDesc: "日期(由新到旧)",
sizeAsc: "大小(从小到大)",
sizeDesc: "大小(从大到小)",
nameAsc: "名字(A-z)",
nameDesc: "名字(Z-a)",
random: "随机",
},
d3: {
mimeCount: "各类文件数量分布",
mimeSize: "各类文件大小分布",
dateHistogram: "文件修改时间分布",
sizeHistogram: "文件大小分布",
},
indexPicker: {
selectNone: "清空",
selectAll: "全选",
selectedIndex: "选中索引",
selectedIndices: "选中索引",
},
},
}

View File

@@ -27,6 +27,8 @@ export default new Vuex.Store({
size: 60,
optLang: "en",
optLangIsDefault: true,
optHideDuplicates: true,
optTheme: "light",
optDisplay: "grid",
@@ -45,6 +47,8 @@ export default new Vuex.Store({
optTreemapColor: "PuBuGn",
optLightboxLoadOnlyCurrent: false,
optLightboxSlideDuration: 15,
optHideLegacy: false,
optUpdateMimeMap: true,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@@ -69,9 +73,14 @@ export default new Vuex.Store({
uiLightboxSlide: 0,
uiReachedScrollEnd: false,
uiDetailsMimeAgg: null,
uiShowDetails: false,
uiMimeMap: [] as any[]
},
mutations: {
setUiShowDetails: (state, val) => state.uiShowDetails = val,
setUiDetailsMimeAgg: (state, val) => state.uiDetailsMimeAgg = val,
setUiReachedScrollEnd: (state, val) => state.uiReachedScrollEnd = val,
setTags: (state, val) => state.tags = val,
setPathText: (state, val) => state.pathText = val,
@@ -79,7 +88,11 @@ export default new Vuex.Store({
setSizeMax: (state, val) => state.sizeMax = val,
setSist2Info: (state, val) => state.sist2Info = val,
setSeed: (state, val) => state.seed = val,
setOptLang: (state, val) => state.optLang = val,
setOptHideDuplicates: (state, val) => state.optHideDuplicates = val,
setOptLang: (state, val) => {
state.optLang = val;
state.optLangIsDefault = false;
},
setSortMode: (state, val) => state.sortMode = val,
setIndices: (state, val) => {
state.indices = val;
@@ -142,8 +155,11 @@ export default new Vuex.Store({
setOptTreemapColorGroupingDepth: (state, val) => state.optTreemapColorGroupingDepth = val,
setOptTreemapSize: (state, val) => state.optTreemapSize = val,
setOptTreemapColor: (state, val) => state.optTreemapColor = val,
setOptHideLegacy: (state, val) => state.optHideLegacy = val,
setOptUpdateMimeMap: (state, val) => state.optUpdateMimeMap = val,
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
setOptLightboxSlideDuration: (state, val) => state.optLightboxSlideDuration = val,
setUiMimeMap: (state, val) => state.uiMimeMap = val,
@@ -153,8 +169,18 @@ export default new Vuex.Store({
busUpdateTags: () => {
// noop
},
busSearch: () => {
// noop
},
},
actions: {
setSist2Info: (store, val) => {
store.commit("setSist2Info", val);
if (store.state.optLangIsDefault) {
store.commit("setOptLang", val.lang);
}
},
loadFromArgs({commit}, route: Route) {
if (route.query.q) {
@@ -274,6 +300,7 @@ export default new Vuex.Store({
commit("setUiLightboxTypes", []);
commit("setUiLightboxCaptions", []);
commit("setUiLightboxKey", 0);
commit("setUiDetailsMimeAgg", null);
}
},
modules: {},
@@ -317,6 +344,7 @@ export default new Vuex.Store({
uiLightboxKey: state => state.uiLightboxKey,
uiLightboxSlide: state => state.uiLightboxSlide,
optHideDuplicates: state => state.optHideDuplicates,
optLang: state => state.optLang,
optTheme: state => state.optTheme,
optDisplay: state => state.optDisplay,
@@ -336,5 +364,7 @@ export default new Vuex.Store({
optLightboxLoadOnlyCurrent: state => state.optLightboxLoadOnlyCurrent,
optLightboxSlideDuration: state => state.optLightboxSlideDuration,
optResultSize: state => state.size,
optHideLegacy: state => state.optHideLegacy,
optUpdateMimeMap: state => state.optUpdateMimeMap,
}
})

View File

@@ -97,6 +97,30 @@ export function getSelectedTreeNodes(tree: any) {
return Array.from(selectedNodes);
}
/**
 * Snapshot the UI state (checked / collapsed) of every selectable node of a tree.
 *
 * @param tree Tree widget exposing `selectable()`, which returns the list of nodes.
 * @returns An object keyed by node identifier; each value is `{checked, collapsed}`
 *          copied from the node's `itree.state`.
 */
export function getTreeNodeAttributes(tree: any) {
    const selectableNodes = tree.selectable();
    const attributes = {};

    for (let idx = 0; idx < selectableNodes.length; idx++) {
        const node = selectableNodes[idx];

        // Nodes whose label contains "(" and that carry a `values` list are keyed by
        // the last entry of that list; every other node is keyed by its own id.
        const keyedByValue = node.text.indexOf("(") !== -1 && node.values;
        const key = keyedByValue ? node.values.slice(-1)[0] : node.id;

        const {checked, collapsed} = node.itree.state;
        attributes[key] = {checked, collapsed};
    }

    return attributes;
}
export function serializeMimes(mimes: string[]): string | undefined {
if (mimes.length == 0) {
return undefined;

View File

@@ -15,11 +15,8 @@
<h4>{{ $t("displayOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<label>{{ $t("opt.lang") }}</label>
<label><LanguageIcon/><span style="vertical-align: middle">&nbsp;{{ $t("opt.lang") }}</span></label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
<label>{{ $t("opt.theme") }}</label>
@@ -30,11 +27,30 @@
<label>{{ $t("opt.columns") }}</label>
<b-form-select :options="columnsOptions" :value="optColumns" @input="setOptColumns"></b-form-select>
<div style="height: 10px"></div>
<b-form-checkbox :checked="optLightboxLoadOnlyCurrent" @input="setOptLightboxLoadOnlyCurrent">
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<b-form-checkbox :checked="optUpdateMimeMap" @input="setOptUpdateMimeMap">
{{ $t("opt.updateMimeMap") }}
</b-form-checkbox>
</b-card>
<br/>
<h4>{{ $t("searchOptions") }}</h4>
<b-card>
<b-form-checkbox :checked="optHideDuplicates" @input="setOptHideDuplicates">{{
$t("opt.hideDuplicates")
}}
</b-form-checkbox>
<b-form-checkbox :checked="optHighlight" @input="setOptHighlight">{{ $t("opt.highlight") }}</b-form-checkbox>
<b-form-checkbox :checked="optTagOrOperator" @input="setOptTagOrOperator">{{
$t("opt.tagOrOperator")
@@ -108,15 +124,15 @@
</template>
<script>
import Vue from "vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import DebugInfo from "@/components/DebugInfo.vue";
import Preloader from "@/components/Preloader.vue";
import sist2 from "@/Sist2Api";
import GearIcon from "@/components/GearIcon.vue";
import GearIcon from "@/components/icons/GearIcon.vue";
import LanguageIcon from "@/components/icons/LanguageIcon";
export default {
components: {GearIcon, DebugInfo, Preloader},
components: {LanguageIcon, GearIcon, DebugInfo, Preloader},
data() {
return {
loading: true,
@@ -124,6 +140,7 @@ export default {
langOptions: [
{value: "en", text: this.$t("lang.en")},
{value: "fr", text: this.$t("lang.fr")},
{value: "zh-CN", text: this.$t("lang.zh-CN")},
],
queryModeOptions: [
{value: "simple", text: this.$t("queryMode.simple")},
@@ -206,10 +223,12 @@ export default {
"optTreemapSize",
"optLightboxLoadOnlyCurrent",
"optLightboxSlideDuration",
"optContainerWidth",
"optResultSize",
"optTagOrOperator",
"optLang"
"optLang",
"optHideDuplicates",
"optHideLegacy",
"optUpdateMimeMap",
]),
clientWidth() {
return window.innerWidth;
@@ -217,7 +236,7 @@ export default {
},
mounted() {
sist2.getSist2Info().then(data => {
this.$store.commit("setSist2Info", data)
this.setSist2Info(data);
this.loading = false;
});
@@ -228,6 +247,9 @@ export default {
});
},
methods: {
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations([
"setOptTheme",
"setOptDisplay",
@@ -245,10 +267,12 @@ export default {
"setOptTreemapSize",
"setOptLightboxLoadOnlyCurrent",
"setOptLightboxSlideDuration",
"setOptContainerWidth",
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang"
"setOptLang",
"setOptHideDuplicates",
"setOptHideLegacy",
"setOptUpdateMimeMap"
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");

View File

@@ -31,7 +31,7 @@
</b-row>
</b-col>
<b-col>
<b-tabs>
<b-tabs justified>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
@@ -43,9 +43,13 @@
</b-row>
</b-card>
<Preloader v-if="searchBusy && docs.length === 0" class="mt-3"></Preloader>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<div v-else-if="docs.length > 0">
<ResultsCard></ResultsCard>
</div>
<div v-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
@@ -56,7 +60,7 @@
<script lang="ts">
import Preloader from "@/components/Preloader.vue";
import {mapGetters, mapMutations} from "vuex";
import {mapActions, mapGetters, mapMutations} from "vuex";
import sist2 from "../Sist2Api";
import Sist2Api, {EsHit, EsResult} from "../Sist2Api";
import SearchBar from "@/components/SearchBar.vue";
@@ -91,6 +95,7 @@ export default Vue.extend({
search: undefined as any,
docs: [] as EsHit[],
docIds: new Set(),
docChecksums: new Set(),
searchBusy: false,
Sist2Query: Sist2Query,
showHelp: false
@@ -108,10 +113,6 @@ export default Vue.extend({
}, 350, {leading: false});
Sist2Api.getMimeTypes().then(mimeMap => {
this.$store.commit("setUiMimeMap", mimeMap);
});
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
@@ -137,17 +138,23 @@ export default Vue.extend({
sist2.getSist2Info().then(data => {
this.setSist2Info(data);
this.setIndices(data.indices);
this.uiLoading = false;
this.search(true);
Sist2Api.getMimeTypes(Sist2Query.searchQuery()).then(({mimeMap}) => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
}).catch(() => {
this.showErrorToast();
});
});
},
methods: {
...mapMutations({
...mapActions({
setSist2Info: "setSist2Info",
}),
...mapMutations({
setIndices: "setIndices",
setDateBoundsMin: "setDateBoundsMin",
setDateBoundsMax: "setDateBoundsMax",
@@ -178,6 +185,7 @@ export default Vue.extend({
async searchNow(q: any) {
this.searchBusy = true;
await this.$store.dispatch("incrementQuerySequence");
this.$store.commit("busSearch");
Sist2Api.esQuery(q).then(async (resp: EsResult) => {
await this.handleSearch(resp);
@@ -193,6 +201,7 @@ export default Vue.extend({
async clearResults() {
this.docs = [];
this.docIds.clear();
this.docChecksums.clear();
await this.$store.dispatch("clearResults");
this.$store.commit("setUiReachedScrollEnd", false);
},
@@ -202,7 +211,19 @@ export default Vue.extend({
}
resp.hits.hits = resp.hits.hits.filter(hit => !this.docIds.has(hit._id));
resp.hits.hits.forEach(hit => this.docIds.add(hit._id));
if (this.$store.state.optHideDuplicates) {
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);
this.docChecksums.add(hit._source.checksum);
return isDupe;
});
}
for (const hit of resp.hits.hits) {
if (hit._props.isPlayableImage || hit._props.isPlayableVideo) {
@@ -266,6 +287,11 @@ export default Vue.extend({
border: none;
}
.toast-header-info, .toast-body-info {
background: #2196f3;
color: #fff !important;
}
.toast-header-error, .toast-body-error {
background: #a94442;
color: #f2dede !important;

View File

@@ -22,6 +22,7 @@
const char *TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"/usr/share/tesseract-ocr/4.00/tessdata/",
"./",
NULL
};
@@ -145,7 +146,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->name == NULL) {
args->name = g_path_get_basename(args->output);
} else {
char* tmp = malloc(strlen(args->name) + 1);
char *tmp = malloc(strlen(args->name) + 1);
strcpy(tmp, args->name);
args->name = tmp;
}
@@ -167,17 +168,50 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
return 1;
}
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
if (args->ocr_images && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-images");
return 1;
}
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
if (args->ocr_ebooks && args->tesseract_lang == NULL) {
fprintf(stderr, "You must specify --ocr-lang <LANG> to use --ocr-ebooks");
return 1;
}
if (args->tesseract_lang != NULL) {
if (!args->ocr_ebooks && !args->ocr_images) {
fprintf(stderr, "You must specify at least one of --ocr-ebooks, --ocr-images");
return 1;
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
TessBaseAPI *api = TessBaseAPICreate();
const char *trained_data_path = NULL;
char *lang = malloc(strlen(args->tesseract_lang) + 1);
strcpy(lang, args->tesseract_lang);
lang = strtok(lang, "+");
while (lang != NULL) {
char filename[128];
sprintf(filename, "%s.traineddata", lang);
const char *path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATALF("cli.c", "Could not find tesseract language file: %s!", filename);
}
if (trained_data_path != NULL && path != trained_data_path) {
LOG_FATAL("cli.c", "When specifying more than one tesseract language, all the traineddata "
"files must be in the same folder")
}
trained_data_path = path;
lang = strtok(NULL, "+");
}
free(lang);
ret = TessBaseAPIInit3(api, trained_data_path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
@@ -185,7 +219,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
args->tesseract_path = trained_data_path;
}
if (args->exclude_regex != NULL) {
@@ -218,6 +252,19 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
args->max_memory_buffer = DEFAULT_MAX_MEM_BUFFER;
}
// --list-file: open the list of paths to scan ("-" means read the list from stdin)
if (args->list_path != NULL) {
    if (strcmp(args->list_path, "-") == 0) {
        args->list_file = stdin;
        LOG_DEBUG("cli.c", "Using stdin as list file")
    } else {
        args->list_file = fopen(args->list_path, "r");
        if (args->list_file == NULL) {
            // errno is an int: it has to go through strerror() to be printed with %s
            LOG_FATALF("cli.c", "List file could not be opened: %s (%s)", args->list_path, strerror(errno));
        }
    }
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
LOG_DEBUGF("cli.c", "arg size=%d", args->size)
LOG_DEBUGF("cli.c", "arg content_size=%d", args->content_size)
@@ -237,6 +284,7 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg fast_epub=%d", args->fast_epub)
LOG_DEBUGF("cli.c", "arg treemap_threshold=%f", args->treemap_threshold)
LOG_DEBUGF("cli.c", "arg max_memory_buffer=%d", args->max_memory_buffer)
LOG_DEBUGF("cli.c", "arg list_path=%s", args->list_path)
return 0;
}
@@ -362,15 +410,15 @@ int web_args_validate(web_args_t *args, int argc, const char **argv) {
args->es_index = DEFAULT_ES_INDEX;
}
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (args->tagline == NULL) {
args->tagline = DEFAULT_TAGLINE;
}
if (strlen(args->lang) != 2) {
if (args->lang == NULL) {
args->lang = DEFAULT_LANG;
}
if (strlen(args->lang) != 2 && strlen(args->lang) != 5) {
fprintf(stderr, "Invalid --lang value, see usage\n");
return 1;
}

View File

@@ -21,6 +21,8 @@ typedef struct scan_args {
char *archive_passphrase;
char *tesseract_lang;
const char *tesseract_path;
int ocr_images;
int ocr_ebooks;
char *exclude_regex;
int fast;
const char* treemap_threshold_str;
@@ -28,6 +30,9 @@ typedef struct scan_args {
int max_memory_buffer;
int read_subtitles;
int fast_epub;
int calculate_checksums;
char *list_path;
FILE *list_file;
} scan_args_t;
scan_args_t *scan_args_create();

View File

@@ -2,6 +2,7 @@
ScanCtx_t ScanCtx = {
.stat_index_size = 0,
.stat_tn_size = 0,
.dbg_current_files = NULL,
.pool = NULL
};

View File

@@ -14,7 +14,10 @@
#include "libscan/mobi/scan_mobi.h"
#include "libscan/raw/raw.h"
#include "libscan/msdoc/msdoc.h"
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include "src/index/elastic.h"
#include <glib.h>
#include <pcre.h>
@@ -31,12 +34,14 @@ typedef struct {
int threads;
int depth;
int calculate_checksums;
size_t stat_tn_size;
size_t stat_index_size;
GHashTable *original_table;
GHashTable *copy_table;
pthread_mutex_t copy_table_mu;
pcre *exclude;
pcre_extra *exclude_extra;
@@ -60,6 +65,8 @@ typedef struct {
scan_mobi_ctx_t mobi_ctx;
scan_raw_ctx_t raw_ctx;
scan_msdoc_ctx_t msdoc_ctx;
scan_wpd_ctx_t wpd_ctx;
scan_json_ctx_t json_ctx;
} ScanCtx_t;
typedef struct {
@@ -70,6 +77,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int batch_size;
tpool_t *pool;
@@ -81,6 +89,7 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int index_count;
char *auth_user;
@@ -89,7 +98,7 @@ typedef struct {
int tag_auth_enabled;
char *tagline;
struct index_t indices[256];
char lang[3];
char lang[10];
int dev;
} WebCtx_t;

View File

@@ -253,7 +253,7 @@ void _elastic_flush(int max) {
} else {
print_errors(r);
LOG_INFOF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
LOG_DEBUGF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
delete_queue(max);
if (Indexer->queued != 0) {
@@ -356,7 +356,65 @@ void finish_indexer(char *script, int async_script, char *index_id) {
free_response(r);
}
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings) {
es_version_t *elastic_get_version(const char *es_url) {
response_t *r = web_get(es_url, 30);
char *tmp = malloc(r->size + 1);
memcpy(tmp, r->body, r->size);
*(tmp + r->size) = '\0';
cJSON *response = cJSON_Parse(tmp);
free(tmp);
free_response(r);
if (response == NULL) {
return NULL;
}
if (cJSON_GetObjectItem(response, "version") == NULL ||
cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number") == NULL) {
cJSON_Delete(response);
return NULL;
}
char *version_str = cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number")->valuestring;
es_version_t *version = malloc(sizeof(es_version_t));
const char *tok = strtok(version_str, ".");
version->major = atoi(tok);
tok = strtok(NULL, ".");
version->minor = atoi(tok);
tok = strtok(NULL, ".");
version->patch = atoi(tok);
cJSON_Delete(response);
return version;
}
void elastic_init(int force_reset, const char *user_mappings, const char *user_settings) {
es_version_t *es_version = elastic_get_version(IndexCtx.es_url);
IndexCtx.es_version = es_version;
if (es_version == NULL) {
LOG_FATAL("elastic.c", "Could not get ES version")
}
LOG_INFOF("elastic.c",
"Elasticsearch version is %s (supported=%d, legacy=%d)",
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), USE_LEGACY_ES_SETTINGS(es_version));
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
LOG_FATAL("elastic.c", "sist2 only supports Elasticsearch v6.8 or newer")
}
// Pick the settings document matching the cluster version: versions for which
// USE_LEGACY_ES_SETTINGS() is true get the legacy settings, newer ones the regular ones.
// NOTE(review): the two branches were swapped before; confirm against the contents
// of settings_json / settings_legacy_json.
char *settings = NULL;
if (USE_LEGACY_ES_SETTINGS(es_version)) {
    settings = settings_legacy_json;
} else {
    settings = settings_json;
}
// Check if index exists
char url[4096];
@@ -392,7 +450,7 @@ void elastic_init(int force_reset, const char* user_mappings, const char* user_s
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_settings", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, user_settings ? user_settings : settings_json);
r = web_put(url, user_settings ? user_settings : settings);
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);

View File

@@ -9,6 +9,26 @@ typedef struct es_bulk_line {
char line[0];
} es_bulk_line_t;
/** Elasticsearch version number split into its three numeric components. */
typedef struct {
    int major;
    int minor;
    int patch;
} es_version_t;

/* True when *version is at least maj.min (the patch level is not looked at). */
#define VERSION_GE(version, maj, min) \
    ((version)->major > (maj) || ((version)->major == (maj) && (version)->minor >= (min)))

/* sist2 requires Elasticsearch 6.8 or newer. */
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)

/* Versions older than 7.14 use the legacy settings. */
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))

/**
 * Render a version as "major.minor.patch".
 *
 * The result lives in a static buffer: it is overwritten by the next call and must
 * not be freed by the caller.
 */
__always_inline
static const char *format_es_version(es_version_t *version) {
    static char formatted[64];

    snprintf(formatted, sizeof formatted, "%d.%d.%d",
             version->major, version->minor, version->patch);

    return formatted;
}
/**
* Note: indexer is *not* thread safe
*/
@@ -31,6 +51,8 @@ cJSON *elastic_get_document(const char *id_str);
char *elastic_get_status();
es_version_t *elastic_get_version(const char *es_url);
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
#endif

File diff suppressed because one or more lines are too long

View File

@@ -38,6 +38,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "parent";
case MetaExifMake:
return "exif_make";
case MetaExifDescription:
return "exif_description";
case MetaExifSoftware:
return "exif_software";
case MetaExifExposureTime:
@@ -74,6 +76,8 @@ char *get_meta_key_text(enum metakey meta_key) {
return "exif_gps_latitude_dms";
case MetaExifGpsLatitudeDec:
return "exif_gps_latitude_dec";
case MetaChecksum:
return "checksum";
default:
LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
}
@@ -148,6 +152,7 @@ char *build_json_string(document_t *doc) {
case MetaFontName:
case MetaParent:
case MetaExifMake:
case MetaExifDescription:
case MetaExifSoftware:
case MetaExifExposureTime:
case MetaExifFNumber:
@@ -165,6 +170,7 @@ char *build_json_string(document_t *doc) {
case MetaExifGpsLatitudeDMS:
case MetaExifGpsLatitudeDec:
case MetaExifGpsLatitudeRef:
case MetaChecksum:
case MetaTitle: {
cJSON_AddStringToObject(json, get_meta_key_text(meta->key), meta->str_val);
buffer_size_guess += (int) strlen(meta->str_val);

View File

@@ -4,6 +4,7 @@
store_t *store_create(const char *path, size_t chunk_size) {
store_t *store = malloc(sizeof(struct store_t));
mkdir(path, S_IWUSR | S_IRUSR | S_IXUSR);
strcpy(store->path, path);
#if (SIST_FAKE_STORE != 1)
store->chunk_size = chunk_size;
@@ -22,7 +23,6 @@ store_t *store_create(const char *path, size_t chunk_size) {
}
store->size = (size_t) store->chunk_size;
ScanCtx.stat_tn_size = 0;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi
@@ -78,27 +78,57 @@ void store_write(store_t *store, char *key, size_t key_len, char *buf, size_t bu
int put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
ScanCtx.stat_tn_size += buf_len;
int db_full = FALSE;
int should_abort_transaction = FALSE;
if (put_ret == MDB_MAP_FULL) {
mdb_txn_abort(txn);
db_full = TRUE;
should_abort_transaction = TRUE;
} else {
int commit_ret = mdb_txn_commit(txn);
if (commit_ret == MDB_MAP_FULL) {
db_full = TRUE;
}
}
if (db_full) {
LOG_INFOF("store.c", "Updating mdb mapsize to %lu bytes", store->size)
if (should_abort_transaction) {
mdb_txn_abort(txn);
}
pthread_rwlock_unlock(&store->lock);
// Cannot resize when there is a opened transaction.
// Resize take effect on the next commit.
pthread_rwlock_wrlock(&store->lock);
store->size += store->chunk_size;
mdb_env_set_mapsize(store->env, store->size);
int resize_ret = mdb_env_set_mapsize(store->env, store->size);
if (resize_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
mdb_txn_begin(store->env, NULL, 0, &txn);
put_ret = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
int put_ret_retry = mdb_put(txn, store->dbi, &mdb_key, &mdb_value, 0);
if (put_ret_retry != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
int ret = mdb_txn_commit(txn);
if (ret != 0) {
LOG_FATALF("store.c", "FIXME: Could not commit to store %s: %s (%d), %d, %d %d",
store->path, mdb_strerror(ret), ret,
put_ret, put_ret_retry);
}
LOG_INFOF("store.c", "Updated mdb mapsize to %lu bytes", store->size)
}
mdb_txn_commit(txn);
pthread_rwlock_unlock(&store->lock);
if (put_ret != 0) {
} else if (put_ret != 0) {
LOG_ERROR("store.c", mdb_strerror(put_ret))
}
pthread_rwlock_unlock(&store->lock);
#endif
}

View File

@@ -6,12 +6,12 @@
#include <glib.h>
#define STORE_SIZE_TN 1024 * 1024 * 5
#define STORE_SIZE_TAG 1024 * 16
#define STORE_SIZE_TN (1024 * 1024 * 5)
#define STORE_SIZE_TAG (1024 * 1024)
#define STORE_SIZE_META STORE_SIZE_TAG
typedef struct store_t {
char *path;
char path[PATH_MAX];
char *tmp_path;
MDB_dbi dbi;
MDB_env *env;

View File

@@ -4,6 +4,8 @@
#include <ftw.h>
/*
 * True when string x starts with prefix y, ignoring the LAST character of y
 * (only the first strlen(y) - 1 characters are compared).
 * y must not be empty, and y is evaluated twice.
 */
#define STR_STARTS_WITH(x, y) (strncmp((y), (x), strlen(y) - 1) == 0)
__always_inline
parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info, int base) {
int len = (int) strlen(filepath);
@@ -24,39 +26,110 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
job->vfile.filepath = job->filepath;
job->vfile.read = fs_read;
// Filesystem reads are always rewindable
job->vfile.read_rewindable = fs_read;
job->vfile.reset = fs_reset;
job->vfile.close = fs_close;
job->vfile.fd = -1;
job->vfile.is_fs_file = TRUE;
job->vfile.has_checksum = FALSE;
job->vfile.rewind_buffer_size = 0;
job->vfile.rewind_buffer = NULL;
job->vfile.calculate_checksum = ScanCtx.calculate_checksums;
return job;
}
int sub_strings[30];
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, filepath, strlen(filepath), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, str, strlen(str), 0, 0, sub_strings, sizeof(sub_strings)) >= 0)
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ftw->level > ScanCtx.depth) {
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
return 0;
} else if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
return FTW_CONTINUE;
}
#define MAX_FILE_DESCRIPTORS 64
int walk_directory_tree(const char *dirpath) {
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_DEPTH);
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
}
int iterate_file_list(void *input_file) {
char buf[PATH_MAX];
struct stat info;
while (fgets(buf, sizeof(buf), input_file) != NULL) {
// Remove trailing newline
*(buf + strlen(buf) - 1) = '\0';
int stat_ret = stat(buf, &info);
if (stat_ret != 0) {
LOG_ERRORF("walk.c", "Could not stat file %s (%s)", buf, strerror(errno));
continue;
}
if (!S_ISREG(info.st_mode)) {
LOG_ERRORF("walk.c", "Is not a regular file: %s", buf);
continue;
}
char *absolute_path = canonicalize_file_name(buf);
if (absolute_path == NULL) {
LOG_FATALF("walk.c", "FIXME: Could not get absolute path of %s", buf);
}
if (ScanCtx.exclude != NULL && EXCLUDED(absolute_path)) {
LOG_DEBUGF("walk.c", "Excluded: %s", absolute_path)
if (S_ISREG(info.st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
}
continue;
}
if (!STR_STARTS_WITH(absolute_path, ScanCtx.index.desc.root)) {
LOG_FATALF("walk.c", "File is not a children of root folder (%s): %s", ScanCtx.index.desc.root, buf);
}
int base = (int) (strrchr(buf, '/') - buf) + 1;
parse_job_t *job = create_fs_parse_job(absolute_path, &info, base);
free(absolute_path);
tpool_add_work(ScanCtx.pool, parse, job);
}
return 0;
}

View File

@@ -5,4 +5,6 @@
int walk_directory_tree(const char *);
int iterate_file_list(void* input_file);
#endif

View File

@@ -55,10 +55,14 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
log_len += 1;
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno))
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}
void sist_logf(const char *filepath, int level, char *format, ...) {
@@ -104,8 +108,12 @@ void sist_log(const char *filepath, int level, char *str) {
);
}
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
}
write(STDERR_FILENO, log_str, log_len);
}

View File

@@ -14,6 +14,9 @@
#include "parsing/mime.h"
#include "parsing/parse.h"
#include <signal.h>
#include <unistd.h>
#include "stats.h"
#define DESCRIPTION "Lightning-fast file system indexer and search tool."
@@ -29,8 +32,6 @@ static const char *const usage[] = {
NULL,
};
#include<signal.h>
#include<unistd.h>
static __sighandler_t sigsegv_handler = NULL;
static __sighandler_t sigabrt_handler = NULL;
@@ -169,6 +170,9 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.dbg_current_files = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, NULL);
pthread_mutex_init(&ScanCtx.dbg_current_files_mu, NULL);
pthread_mutex_init(&ScanCtx.dbg_file_counts_mu, NULL);
pthread_mutex_init(&ScanCtx.copy_table_mu, NULL);
ScanCtx.calculate_checksums = args->calculate_checksums;
// Archive
ScanCtx.arc_ctx.mode = args->archive_mode;
@@ -216,6 +220,11 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.media_ctx.store = _store;
ScanCtx.media_ctx.max_media_buffer = (long) args->max_memory_buffer * 1024 * 1024;
ScanCtx.media_ctx.read_subtitles = args->read_subtitles;
if (args->ocr_images) {
ScanCtx.media_ctx.tesseract_lang = args->tesseract_lang;
ScanCtx.media_ctx.tesseract_path = args->tesseract_path;
}
init_media();
// OOXML
@@ -258,6 +267,19 @@ void initialize_scan_context(scan_args_t *args) {
ScanCtx.raw_ctx.log = _log;
ScanCtx.raw_ctx.logf = _logf;
ScanCtx.raw_ctx.store = _store;
// Wpd
ScanCtx.wpd_ctx.content_size = args->content_size;
ScanCtx.wpd_ctx.log = _log;
ScanCtx.wpd_ctx.logf = _logf;
ScanCtx.wpd_ctx.wpd_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/wordperfect");
// Json
ScanCtx.json_ctx.content_size = args->content_size;
ScanCtx.json_ctx.log = _log;
ScanCtx.json_ctx.logf = _logf;
ScanCtx.json_ctx.json_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/json");
ScanCtx.json_ctx.ndjson_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/ndjson");
}
@@ -319,10 +341,20 @@ void sist2_scan(scan_args_t *args) {
ScanCtx.writer_pool = tpool_create(1, writer_cleanup, TRUE, FALSE);
tpool_start(ScanCtx.writer_pool);
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
if (args->list_path) {
// Scan using file list
int list_ret = iterate_file_list(args->list_file);
if (list_ret != 0) {
LOG_FATALF("main.c", "iterate_file_list() failed! (%d)", list_ret)
}
} else {
// Scan directory recursively
int walk_ret = walk_directory_tree(ScanCtx.index.desc.root);
if (walk_ret == -1) {
LOG_FATALF("main.c", "walk_directory_tree() failed! %s (%d)", strerror(errno), errno)
}
}
tpool_wait(ScanCtx.pool);
tpool_destroy(ScanCtx.pool);
@@ -418,7 +450,7 @@ void sist2_index(index_args_t *args) {
cleanup = elastic_cleanup;
}
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, FALSE);
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, args->print == 0);
tpool_start(IndexCtx.pool);
struct dirent *de;
@@ -474,7 +506,7 @@ void sist2_web(web_args_t *args) {
WebCtx.tag_auth_enabled = args->tag_auth_enabled;
WebCtx.tagline = args->tagline;
WebCtx.dev = args->dev;
strcpy(WebCtx.lang, "en");
strcpy(WebCtx.lang, args->lang);
for (int i = 0; i < args->index_count; i++) {
char *abs_path = abspath(args->indices[i]);
@@ -549,8 +581,11 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "archive-passphrase", &scan_args->archive_passphrase,
"Passphrase for encrypted archive files"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING(0, "ocr-lang", &scan_args->tesseract_lang,
"Tesseract language (use 'tesseract --list-langs' to see "
"which are installed on your machine)"),
OPT_BOOLEAN(0, "ocr-images", &scan_args->ocr_images, "Enable OCR'ing of image files."),
OPT_BOOLEAN(0, "ocr-ebooks", &scan_args->ocr_ebooks, "Enable OCR'ing of ebook files."),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_STRING(0, "treemap-threshold", &scan_args->treemap_threshold_str, "Relative size threshold for treemap "
@@ -561,6 +596,10 @@ int main(int argc, const char *argv[]) {
OPT_BOOLEAN(0, "read-subtitles", &scan_args->read_subtitles, "Read subtitles from media files."),
OPT_BOOLEAN(0, "fast-epub", &scan_args->fast_epub,
"Faster but less accurate EPUB parsing (no thumbnails, metadata)"),
OPT_BOOLEAN(0, "checksums", &scan_args->calculate_checksums, "Calculate file checksums when scanning."),
OPT_STRING(0, "list-file", &scan_args->list_path, "Specify a list of newline-delimited paths to be scanned"
" instead of normal directory traversal. Use '-' to read"
" from stdin."),
OPT_GROUP("Index options"),
OPT_INTEGER('t', "threads", &common_threads, "Number of threads. DEFAULT=1"),
@@ -583,6 +622,7 @@ int main(int argc, const char *argv[]) {
OPT_STRING(0, "tag-auth", &web_args->tag_credentials, "Basic auth in user:password format for tagging"),
OPT_STRING(0, "tagline", &web_args->tagline, "Tagline in navbar"),
OPT_BOOLEAN(0, "dev", &web_args->dev, "Serve html & js files from disk (for development)"),
OPT_STRING(0, "lang", &web_args->lang, "Default UI language. Can be changed by the user"),
OPT_GROUP("Exec-script options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url. DEFAULT=http://localhost:9200"),

View File

@@ -35,427 +35,426 @@ enum mime {
application_mime=655387,
application_mspowerpoint=655388,
application_msword=655389,
application_netmc=655390,
application_octet_stream=655391,
application_oda=655392,
application_ogg=655393,
application_pdf=655394 | 0x40000000,
application_pgp_keys=655395,
application_pgp_signature=655396,
application_pkcs7_signature=655397,
application_pkix_cert=655398,
application_postscript=655399,
application_pro_eng=655400,
application_ringing_tones=655401,
application_smil=655402,
application_solids=655403,
application_sounder=655404,
application_step=655405,
application_streamingmedia=655406,
application_vda=655407,
application_vnd_amazon_mobi8_ebook=655408 | 0x02000000,
application_vnd_coffeescript=655409,
application_vnd_fdf=655410,
application_vnd_font_fontforge_sfd=655411,
application_vnd_hp_hpgl=655412,
application_vnd_iccprofile=655413,
application_vnd_lotus_1_2_3=655414,
application_vnd_ms_cab_compressed=655415,
application_vnd_ms_excel=655416,
application_vnd_ms_fontobject=655417,
application_vnd_ms_opentype=655418 | 0x20000000,
application_vnd_ms_outlook=655419,
application_vnd_ms_pki_certstore=655420,
application_vnd_ms_pki_pko=655421,
application_vnd_ms_pki_seccat=655422,
application_vnd_ms_powerpoint=655423,
application_vnd_ms_project=655424,
application_vnd_oasis_opendocument_base=655425,
application_vnd_oasis_opendocument_formula=655426,
application_vnd_oasis_opendocument_graphics=655427,
application_vnd_oasis_opendocument_presentation=655428,
application_vnd_oasis_opendocument_spreadsheet=655429,
application_vnd_oasis_opendocument_text=655430,
application_vnd_openxmlformats_officedocument_presentationml_presentation=655431 | 0x04000000,
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655432 | 0x04000000,
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655433 | 0x04000000,
application_vnd_symbian_install=655434,
application_vnd_tcpdump_pcap=655435,
application_vnd_wap_wmlc=655436,
application_vnd_wap_wmlscriptc=655437,
application_vnd_xara=655438,
application_vocaltec_media_desc=655439,
application_vocaltec_media_file=655440,
application_warc=655441,
application_winhelp=655442,
application_wordperfect=655443,
application_wordperfect6_0=655444,
application_wordperfect6_1=655445,
application_x_123=655446,
application_x_7z_compressed=655447 | 0x10000000,
application_x_aim=655448,
application_x_apple_diskimage=655449,
application_x_arc=655450 | 0x10000000,
application_x_archive=655451,
application_x_atari_7800_rom=655452,
application_x_authorware_bin=655453,
application_x_authorware_map=655454,
application_x_authorware_seg=655455,
application_x_avira_qua=655456,
application_x_bcpio=655457,
application_x_bittorrent=655458,
application_x_bsh=655459,
application_x_bytecode_python=655460,
application_x_bzip=655461,
application_x_bzip2=655462 | 0x08000000,
application_x_cbr=655463,
application_x_cbz=655464,
application_x_cdlink=655465,
application_x_chat=655466,
application_x_chrome_extension=655467,
application_x_cocoa=655468,
application_x_conference=655469,
application_x_coredump=655470,
application_x_cpio=655471,
application_x_dbf=655472,
application_x_dbt=655473,
application_x_debian_package=655474,
application_x_deepv=655475,
application_x_director=655476,
application_x_dmp=655477,
application_x_dosdriver=655478,
application_x_dosexec=655479,
application_x_dvi=655480,
application_x_elc=655481,
application_ndjson=655390,
application_netmc=655391,
application_octet_stream=655392,
application_oda=655393,
application_ogg=655394,
application_pdf=655395 | 0x40000000,
application_pgp_keys=655396,
application_pgp_signature=655397,
application_pkcs7_signature=655398,
application_pkix_cert=655399,
application_postscript=655400,
application_pro_eng=655401,
application_ringing_tones=655402,
application_smil=655403,
application_solids=655404,
application_sounder=655405,
application_step=655406,
application_streamingmedia=655407,
application_vda=655408,
application_vnd_amazon_mobi8_ebook=655409 | 0x02000000,
application_vnd_coffeescript=655410,
application_vnd_fdf=655411,
application_vnd_font_fontforge_sfd=655412,
application_vnd_hp_hpgl=655413,
application_vnd_iccprofile=655414,
application_vnd_lotus_1_2_3=655415,
application_vnd_ms_cab_compressed=655416,
application_vnd_ms_excel=655417,
application_vnd_ms_fontobject=655418,
application_vnd_ms_opentype=655419 | 0x20000000,
application_vnd_ms_outlook=655420,
application_vnd_ms_pki_certstore=655421,
application_vnd_ms_pki_pko=655422,
application_vnd_ms_pki_seccat=655423,
application_vnd_ms_powerpoint=655424,
application_vnd_ms_project=655425,
application_vnd_oasis_opendocument_base=655426,
application_vnd_oasis_opendocument_formula=655427,
application_vnd_oasis_opendocument_graphics=655428,
application_vnd_oasis_opendocument_presentation=655429,
application_vnd_oasis_opendocument_spreadsheet=655430,
application_vnd_oasis_opendocument_text=655431,
application_vnd_openxmlformats_officedocument_presentationml_presentation=655432 | 0x04000000,
application_vnd_openxmlformats_officedocument_spreadsheetml_sheet=655433 | 0x04000000,
application_vnd_openxmlformats_officedocument_wordprocessingml_document=655434 | 0x04000000,
application_vnd_symbian_install=655435,
application_vnd_tcpdump_pcap=655436,
application_vnd_wap_wmlc=655437,
application_vnd_wap_wmlscriptc=655438,
application_vnd_xara=655439,
application_vocaltec_media_desc=655440,
application_vocaltec_media_file=655441,
application_warc=655442,
application_winhelp=655443,
application_wordperfect=655444,
application_x_123=655445,
application_x_7z_compressed=655446 | 0x10000000,
application_x_aim=655447,
application_x_apple_diskimage=655448,
application_x_arc=655449 | 0x10000000,
application_x_archive=655450,
application_x_atari_7800_rom=655451,
application_x_authorware_bin=655452,
application_x_authorware_map=655453,
application_x_authorware_seg=655454,
application_x_avira_qua=655455,
application_x_bcpio=655456,
application_x_bittorrent=655457,
application_x_bsh=655458,
application_x_bytecode_python=655459,
application_x_bzip=655460,
application_x_bzip2=655461 | 0x08000000,
application_x_cbr=655462,
application_x_cbz=655463,
application_x_cdlink=655464,
application_x_chat=655465,
application_x_chrome_extension=655466,
application_x_cocoa=655467,
application_x_conference=655468,
application_x_coredump=655469,
application_x_cpio=655470,
application_x_dbf=655471,
application_x_dbt=655472,
application_x_debian_package=655473,
application_x_deepv=655474,
application_x_director=655475,
application_x_dmp=655476,
application_x_dosdriver=655477,
application_x_dosexec=655478,
application_x_dvi=655479,
application_x_elc=655480,
application_x_empty=1,
application_x_envoy=655482,
application_x_esrehber=655483,
application_x_excel=655484,
application_x_executable=655485,
application_x_font_gdos=655486,
application_x_font_pf2=655487,
application_x_font_pfm=655488,
application_x_font_sfn=655489,
application_x_font_ttf=655490 | 0x20000000,
application_x_fptapplication_x_dbt=655491,
application_x_freelance=655492,
application_x_gamecube_rom=655493,
application_x_gdbm=655494,
application_x_gettext_translation=655495,
application_x_git=655496,
application_x_gsp=655497,
application_x_gss=655498,
application_x_gtar=655499,
application_x_gzip=655500,
application_x_hdf=655501,
application_x_helpfile=655502,
application_x_httpd_imap=655503,
application_x_ima=655504,
application_x_innosetup=655505,
application_x_internett_signup=655506,
application_x_inventor=655507,
application_x_ip2=655508,
application_x_java_applet=655509,
application_x_java_commerce=655510,
application_x_java_image=655511,
application_x_java_jmod=655512,
application_x_java_keystore=655513,
application_x_kdelnk=655514,
application_x_koan=655515,
application_x_latex=655516,
application_x_livescreen=655517,
application_x_lotus=655518,
application_x_lz4=655519 | 0x08000000,
application_x_lz4_json=655520,
application_x_lzh=655521,
application_x_lzh_compressed=655522,
application_x_lzip=655523 | 0x08000000,
application_x_lzma=655524 | 0x08000000,
application_x_lzop=655525 | 0x08000000,
application_x_lzx=655526,
application_x_mach_binary=655527,
application_x_mach_executable=655528,
application_x_magic_cap_package_1_0=655529,
application_x_mathcad=655530,
application_x_maxis_dbpf=655531,
application_x_meme=655532,
application_x_midi=655533,
application_x_mif=655534,
application_x_mix_transfer=655535,
application_x_mobipocket_ebook=655536 | 0x02000000,
application_x_ms_compress_szdd=655537,
application_x_ms_pdb=655538,
application_x_ms_reader=655539,
application_x_msaccess=655540,
application_x_n64_rom=655541,
application_x_navi_animation=655542,
application_x_navidoc=655543,
application_x_navimap=655544,
application_x_navistyle=655545,
application_x_nes_rom=655546,
application_x_netcdf=655547,
application_x_newton_compatible_pkg=655548,
application_x_nintendo_ds_rom=655549,
application_x_object=655550,
application_x_omc=655551,
application_x_omcdatamaker=655552,
application_x_omcregerator=655553,
application_x_pagemaker=655554,
application_x_pcl=655555,
application_x_pgp_keyring=655556,
application_x_pixclscript=655557,
application_x_pkcs7_certreqresp=655558,
application_x_pkcs7_signature=655559,
application_x_project=655560,
application_x_qpro=655561,
application_x_rar=655562 | 0x10000000,
application_x_rpm=655563,
application_x_sdp=655564,
application_x_sea=655565,
application_x_seelogo=655566,
application_x_setupscript=655567,
application_x_shar=655568,
application_x_sharedlib=655569,
application_x_shockwave_flash=655570,
application_x_snappy_framed=655571,
application_x_sprite=655572,
application_x_sqlite3=655573,
application_x_stargallery_thm=655574,
application_x_stuffit=655575,
application_x_sv4cpio=655576,
application_x_sv4crc=655577,
application_x_tar=655578 | 0x10000000,
application_x_tbook=655579,
application_x_terminfo=655580,
application_x_terminfo2=655581,
application_x_tex_tfm=655582,
application_x_texinfo=655583,
application_x_ustar=655584,
application_x_visio=655585,
application_x_vnd_audioexplosion_mzz=655586,
application_x_vnd_ls_xpix=655587,
application_x_vrml=655588,
application_x_wais_source=655589,
application_x_wine_extension_ini=655590,
application_x_wintalk=655591,
application_x_world=655592,
application_x_wri=655593,
application_x_x509_ca_cert=655594,
application_x_xz=655595 | 0x08000000,
application_x_zip=655596,
application_x_zstd=655597 | 0x08000000,
application_x_zstd_dictionary=655598,
application_xml=655599,
application_zip=655600 | 0x10000000,
application_zlib=655601,
audio_basic=458994 | 0x80000000,
audio_it=458995,
audio_make=458996,
audio_mid=458997,
audio_midi=458998,
audio_mp4=458999,
audio_mpeg=459000,
audio_ogg=459001,
audio_s3m=459002,
audio_tsp_audio=459003,
audio_tsplayer=459004,
audio_vnd_qcelp=459005,
audio_voxware=459006,
audio_x_aiff=459007,
audio_x_flac=459008,
audio_x_gsm=459009,
audio_x_hx_aac_adts=459010,
audio_x_jam=459011,
audio_x_liveaudio=459012,
audio_x_m4a=459013,
audio_x_midi=459014,
audio_x_mod=459015,
audio_x_mp4a_latm=459016,
audio_x_mpeg_3=459017,
audio_x_mpequrl=459018,
audio_x_nspaudio=459019,
audio_x_pn_realaudio=459020,
audio_x_psid=459021,
audio_x_realaudio=459022,
audio_x_s3m=459023,
audio_x_twinvq=459024,
audio_x_twinvq_plugin=459025,
audio_x_voc=459026,
audio_x_wav=459027,
audio_x_xbox_executable=459028 | 0x80000000,
audio_x_xbox360_executable=459029 | 0x80000000,
audio_xm=459030,
font_otf=327959 | 0x20000000,
font_sfnt=327960 | 0x20000000,
font_woff=327961 | 0x20000000,
font_woff2=327962 | 0x20000000,
image_bmp=524571,
image_cmu_raster=524572,
image_fif=524573,
image_florian=524574,
image_g3fax=524575,
image_gif=524576,
image_heic=524577,
image_ief=524578,
image_jpeg=524579,
image_jutvision=524580,
image_naplps=524581,
image_pict=524582,
image_png=524583,
image_svg=524584 | 0x80000000,
image_svg_xml=524585 | 0x80000000,
image_tiff=524586,
image_vnd_adobe_photoshop=524587 | 0x80000000,
image_vnd_djvu=524588 | 0x80000000,
image_vnd_fpx=524589,
image_vnd_microsoft_icon=524590,
image_vnd_rn_realflash=524591,
image_vnd_rn_realpix=524592,
image_vnd_wap_wbmp=524593,
image_vnd_xiff=524594,
image_webp=524595,
image_wmf=524596,
image_x_3ds=524597,
image_x_adobe_dng=524598 | 0x00800000,
image_x_award_bioslogo=524599,
image_x_canon_cr2=524600 | 0x00800000,
image_x_canon_crw=524601 | 0x00800000,
image_x_cmu_raster=524602,
image_x_cur=524603,
image_x_dcraw=524604 | 0x00800000,
image_x_dwg=524605,
image_x_eps=524606,
image_x_epson_erf=524607 | 0x00800000,
image_x_exr=524608,
image_x_fuji_raf=524609 | 0x00800000,
image_x_gem=524610,
image_x_icns=524611,
image_x_icon=524612 | 0x80000000,
image_x_jg=524613,
image_x_jps=524614,
image_x_kodak_dcr=524615 | 0x00800000,
image_x_kodak_k25=524616 | 0x00800000,
image_x_kodak_kdc=524617 | 0x00800000,
image_x_minolta_mrw=524618 | 0x00800000,
image_x_ms_bmp=524619,
image_x_niff=524620,
image_x_nikon_nef=524621 | 0x00800000,
image_x_olympus_orf=524622 | 0x00800000,
image_x_panasonic_raw=524623 | 0x00800000,
image_x_pcx=524624,
image_x_pentax_pef=524625 | 0x00800000,
image_x_pict=524626,
image_x_portable_bitmap=524627,
image_x_portable_graymap=524628,
image_x_portable_pixmap=524629,
image_x_quicktime=524630,
image_x_rgb=524631,
image_x_sigma_x3f=524632 | 0x00800000,
image_x_sony_arw=524633 | 0x00800000,
image_x_sony_sr2=524634 | 0x00800000,
image_x_sony_srf=524635 | 0x00800000,
image_x_tga=524636,
image_x_tiff=524637,
image_x_win_bitmap=524638,
image_x_xcf=524639 | 0x80000000,
image_x_xpixmap=524640 | 0x80000000,
image_x_xwindowdump=524641,
message_news=196962,
message_rfc822=196963,
model_vnd_dwf=65892,
model_vnd_gdl=65893,
model_vnd_gs_gdl=65894,
model_vrml=65895,
model_x_pov=65896,
application_x_envoy=655481,
application_x_esrehber=655482,
application_x_excel=655483,
application_x_executable=655484,
application_x_font_gdos=655485,
application_x_font_pf2=655486,
application_x_font_pfm=655487,
application_x_font_sfn=655488,
application_x_font_ttf=655489 | 0x20000000,
application_x_fptapplication_x_dbt=655490,
application_x_freelance=655491,
application_x_gamecube_rom=655492,
application_x_gdbm=655493,
application_x_gettext_translation=655494,
application_x_git=655495,
application_x_gsp=655496,
application_x_gss=655497,
application_x_gtar=655498,
application_x_gzip=655499,
application_x_hdf=655500,
application_x_helpfile=655501,
application_x_httpd_imap=655502,
application_x_ima=655503,
application_x_innosetup=655504,
application_x_internett_signup=655505,
application_x_inventor=655506,
application_x_ip2=655507,
application_x_java_applet=655508,
application_x_java_commerce=655509,
application_x_java_image=655510,
application_x_java_jmod=655511,
application_x_java_keystore=655512,
application_x_kdelnk=655513,
application_x_koan=655514,
application_x_latex=655515,
application_x_livescreen=655516,
application_x_lotus=655517,
application_x_lz4=655518 | 0x08000000,
application_x_lz4_json=655519,
application_x_lzh=655520,
application_x_lzh_compressed=655521,
application_x_lzip=655522 | 0x08000000,
application_x_lzma=655523 | 0x08000000,
application_x_lzop=655524 | 0x08000000,
application_x_lzx=655525,
application_x_mach_binary=655526,
application_x_mach_executable=655527,
application_x_magic_cap_package_1_0=655528,
application_x_mathcad=655529,
application_x_maxis_dbpf=655530,
application_x_meme=655531,
application_x_midi=655532,
application_x_mif=655533,
application_x_mix_transfer=655534,
application_x_mobipocket_ebook=655535 | 0x02000000,
application_x_ms_compress_szdd=655536,
application_x_ms_pdb=655537,
application_x_ms_reader=655538,
application_x_msaccess=655539,
application_x_n64_rom=655540,
application_x_navi_animation=655541,
application_x_navidoc=655542,
application_x_navimap=655543,
application_x_navistyle=655544,
application_x_nes_rom=655545,
application_x_netcdf=655546,
application_x_newton_compatible_pkg=655547,
application_x_nintendo_ds_rom=655548,
application_x_object=655549,
application_x_omc=655550,
application_x_omcdatamaker=655551,
application_x_omcregerator=655552,
application_x_pagemaker=655553,
application_x_pcl=655554,
application_x_pgp_keyring=655555,
application_x_pixclscript=655556,
application_x_pkcs7_certreqresp=655557,
application_x_pkcs7_signature=655558,
application_x_project=655559,
application_x_qpro=655560,
application_x_rar=655561 | 0x10000000,
application_x_rpm=655562,
application_x_sdp=655563,
application_x_sea=655564,
application_x_seelogo=655565,
application_x_setupscript=655566,
application_x_shar=655567,
application_x_sharedlib=655568,
application_x_shockwave_flash=655569,
application_x_snappy_framed=655570,
application_x_sprite=655571,
application_x_sqlite3=655572,
application_x_stargallery_thm=655573,
application_x_stuffit=655574,
application_x_sv4cpio=655575,
application_x_sv4crc=655576,
application_x_tar=655577 | 0x10000000,
application_x_tbook=655578,
application_x_terminfo=655579,
application_x_terminfo2=655580,
application_x_tex_tfm=655581,
application_x_texinfo=655582,
application_x_ustar=655583,
application_x_visio=655584,
application_x_vnd_audioexplosion_mzz=655585,
application_x_vnd_ls_xpix=655586,
application_x_vrml=655587,
application_x_wais_source=655588,
application_x_wine_extension_ini=655589,
application_x_wintalk=655590,
application_x_world=655591,
application_x_wri=655592,
application_x_x509_ca_cert=655593,
application_x_xz=655594 | 0x08000000,
application_x_zip=655595,
application_x_zstd=655596 | 0x08000000,
application_x_zstd_dictionary=655597,
application_xml=655598,
application_zip=655599 | 0x10000000,
application_zlib=655600,
audio_basic=458993 | 0x80000000,
audio_it=458994,
audio_make=458995,
audio_mid=458996,
audio_midi=458997,
audio_mp4=458998,
audio_mpeg=458999,
audio_ogg=459000,
audio_s3m=459001,
audio_tsp_audio=459002,
audio_tsplayer=459003,
audio_vnd_qcelp=459004,
audio_voxware=459005,
audio_x_aiff=459006,
audio_x_flac=459007,
audio_x_gsm=459008,
audio_x_hx_aac_adts=459009,
audio_x_jam=459010,
audio_x_liveaudio=459011,
audio_x_m4a=459012,
audio_x_midi=459013,
audio_x_mod=459014,
audio_x_mp4a_latm=459015,
audio_x_mpeg_3=459016,
audio_x_mpequrl=459017,
audio_x_nspaudio=459018,
audio_x_pn_realaudio=459019,
audio_x_psid=459020,
audio_x_realaudio=459021,
audio_x_s3m=459022,
audio_x_twinvq=459023,
audio_x_twinvq_plugin=459024,
audio_x_voc=459025,
audio_x_wav=459026,
audio_x_xbox_executable=459027 | 0x80000000,
audio_x_xbox360_executable=459028 | 0x80000000,
audio_xm=459029,
font_otf=327958 | 0x20000000,
font_sfnt=327959 | 0x20000000,
font_woff=327960 | 0x20000000,
font_woff2=327961 | 0x20000000,
image_bmp=524570,
image_cmu_raster=524571,
image_fif=524572,
image_florian=524573,
image_g3fax=524574,
image_gif=524575,
image_heic=524576,
image_ief=524577,
image_jpeg=524578,
image_jutvision=524579,
image_naplps=524580,
image_pict=524581,
image_png=524582,
image_svg=524583 | 0x80000000,
image_svg_xml=524584 | 0x80000000,
image_tiff=524585,
image_vnd_adobe_photoshop=524586 | 0x80000000,
image_vnd_djvu=524587 | 0x80000000,
image_vnd_fpx=524588,
image_vnd_microsoft_icon=524589,
image_vnd_rn_realflash=524590,
image_vnd_rn_realpix=524591,
image_vnd_wap_wbmp=524592,
image_vnd_xiff=524593,
image_webp=524594,
image_wmf=524595,
image_x_3ds=524596,
image_x_adobe_dng=524597 | 0x00800000,
image_x_award_bioslogo=524598,
image_x_canon_cr2=524599 | 0x00800000,
image_x_canon_crw=524600 | 0x00800000,
image_x_cmu_raster=524601,
image_x_cur=524602,
image_x_dcraw=524603 | 0x00800000,
image_x_dwg=524604,
image_x_eps=524605,
image_x_epson_erf=524606 | 0x00800000,
image_x_exr=524607,
image_x_fuji_raf=524608 | 0x00800000,
image_x_gem=524609,
image_x_icns=524610,
image_x_icon=524611 | 0x80000000,
image_x_jg=524612,
image_x_jps=524613,
image_x_kodak_dcr=524614 | 0x00800000,
image_x_kodak_k25=524615 | 0x00800000,
image_x_kodak_kdc=524616 | 0x00800000,
image_x_minolta_mrw=524617 | 0x00800000,
image_x_ms_bmp=524618,
image_x_niff=524619,
image_x_nikon_nef=524620 | 0x00800000,
image_x_olympus_orf=524621 | 0x00800000,
image_x_panasonic_raw=524622 | 0x00800000,
image_x_pcx=524623,
image_x_pentax_pef=524624 | 0x00800000,
image_x_pict=524625,
image_x_portable_bitmap=524626,
image_x_portable_graymap=524627,
image_x_portable_pixmap=524628,
image_x_quicktime=524629,
image_x_rgb=524630,
image_x_sigma_x3f=524631 | 0x00800000,
image_x_sony_arw=524632 | 0x00800000,
image_x_sony_sr2=524633 | 0x00800000,
image_x_sony_srf=524634 | 0x00800000,
image_x_tga=524635,
image_x_tiff=524636,
image_x_win_bitmap=524637,
image_x_xcf=524638 | 0x80000000,
image_x_xpixmap=524639 | 0x80000000,
image_x_xwindowdump=524640,
message_news=196961,
message_rfc822=196962,
model_vnd_dwf=65891,
model_vnd_gdl=65892,
model_vnd_gs_gdl=65893,
model_vrml=65894,
model_x_pov=65895,
sist2_sidecar=2,
text_PGP=590185,
text_asp=590186,
text_css=590187,
text_html=590188 | 0x01000000,
text_javascript=590189,
text_mcf=590190,
text_pascal=590191,
text_plain=590192,
text_richtext=590193,
text_rtf=590194,
text_scriplet=590195,
text_tab_separated_values=590196,
text_troff=590197,
text_uri_list=590198,
text_vnd_abc=590199,
text_vnd_fmi_flexstor=590200,
text_vnd_wap_wml=590201,
text_vnd_wap_wmlscript=590202,
text_webviewhtml=590203,
text_x_Algol68=590204,
text_x_asm=590205,
text_x_audiosoft_intra=590206,
text_x_awk=590207,
text_x_bcpl=590208,
text_x_c=590209,
text_x_c__=590210,
text_x_component=590211,
text_x_diff=590212,
text_x_fortran=590213,
text_x_java=590214,
text_x_la_asf=590215,
text_x_lisp=590216,
text_x_m=590217,
text_x_m4=590218,
text_x_makefile=590219,
text_x_ms_regedit=590220,
text_x_msdos_batch=590221,
text_x_objective_c=590222,
text_x_pascal=590223,
text_x_perl=590224,
text_x_php=590225,
text_x_po=590226,
text_x_python=590227,
text_x_ruby=590228,
text_x_sass=590229,
text_x_scss=590230,
text_x_server_parsed_html=590231,
text_x_setext=590232,
text_x_sgml=590233 | 0x01000000,
text_x_shellscript=590234,
text_x_speech=590235,
text_x_tcl=590236,
text_x_tex=590237,
text_x_uil=590238,
text_x_uuencode=590239,
text_x_vcalendar=590240,
text_x_vcard=590241,
text_xml=590242 | 0x01000000,
video_MP2T=393635,
video_animaflex=393636,
video_avi=393637,
video_avs_video=393638,
video_mp4=393639,
video_mpeg=393640,
video_quicktime=393641,
video_vdo=393642,
video_vivo=393643,
video_vnd_rn_realvideo=393644,
video_vosaic=393645,
video_webm=393646,
video_x_amt_demorun=393647,
video_x_amt_showrun=393648,
video_x_atomic3d_feature=393649,
video_x_dl=393650,
video_x_dv=393651,
video_x_fli=393652,
video_x_flv=393653,
video_x_isvideo=393654,
video_x_jng=393655 | 0x80000000,
video_x_m4v=393656,
video_x_matroska=393657,
video_x_mng=393658,
video_x_motion_jpeg=393659,
video_x_ms_asf=393660,
video_x_msvideo=393661,
video_x_qtc=393662,
video_x_sgi_movie=393663,
x_epoc_x_sisx_app=721344,
text_PGP=590184,
text_asp=590185,
text_css=590186,
text_html=590187 | 0x01000000,
text_javascript=590188,
text_mcf=590189,
text_pascal=590190,
text_plain=590191,
text_richtext=590192,
text_rtf=590193,
text_scriplet=590194,
text_tab_separated_values=590195,
text_troff=590196,
text_uri_list=590197,
text_vnd_abc=590198,
text_vnd_fmi_flexstor=590199,
text_vnd_wap_wml=590200,
text_vnd_wap_wmlscript=590201,
text_webviewhtml=590202,
text_x_Algol68=590203,
text_x_asm=590204,
text_x_audiosoft_intra=590205,
text_x_awk=590206,
text_x_bcpl=590207,
text_x_c=590208,
text_x_c__=590209,
text_x_component=590210,
text_x_diff=590211,
text_x_fortran=590212,
text_x_java=590213,
text_x_la_asf=590214,
text_x_lisp=590215,
text_x_m=590216,
text_x_m4=590217,
text_x_makefile=590218,
text_x_ms_regedit=590219,
text_x_msdos_batch=590220,
text_x_objective_c=590221,
text_x_pascal=590222,
text_x_perl=590223,
text_x_php=590224,
text_x_po=590225,
text_x_python=590226,
text_x_ruby=590227,
text_x_sass=590228,
text_x_scss=590229,
text_x_server_parsed_html=590230,
text_x_setext=590231,
text_x_sgml=590232 | 0x01000000,
text_x_shellscript=590233,
text_x_speech=590234,
text_x_tcl=590235,
text_x_tex=590236,
text_x_uil=590237,
text_x_uuencode=590238,
text_x_vcalendar=590239,
text_x_vcard=590240,
text_xml=590241 | 0x01000000,
video_MP2T=393634,
video_animaflex=393635,
video_avi=393636,
video_avs_video=393637,
video_mp4=393638,
video_mpeg=393639,
video_quicktime=393640,
video_vdo=393641,
video_vivo=393642,
video_vnd_rn_realvideo=393643,
video_vosaic=393644,
video_webm=393645,
video_x_amt_demorun=393646,
video_x_amt_showrun=393647,
video_x_atomic3d_feature=393648,
video_x_dl=393649,
video_x_dv=393650,
video_x_fli=393651,
video_x_flv=393652,
video_x_isvideo=393653,
video_x_jng=393654 | 0x80000000,
video_x_m4v=393655,
video_x_matroska=393656,
video_x_mng=393657,
video_x_motion_jpeg=393658,
video_x_ms_asf=393659,
video_x_msvideo=393660,
video_x_qtc=393661,
video_x_sgi_movie=393662,
x_epoc_x_sisx_app=721343,
};
char *mime_get_mime_text(unsigned int mime_id) {switch (mime_id) {
case application_arj: return "application/arj";
@@ -482,6 +481,7 @@ case application_java_archive: return "application/java-archive";
case application_java: return "application/java";
case application_javascript: return "application/javascript";
case application_json: return "application/json";
case application_ndjson: return "application/ndjson";
case application_marc: return "application/marc";
case application_mbedlet: return "application/mbedlet";
case application_mime: return "application/mime";
@@ -537,8 +537,6 @@ case application_vocaltec_media_desc: return "application/vocaltec-media-desc";
case application_vocaltec_media_file: return "application/vocaltec-media-file";
case application_warc: return "application/warc";
case application_winhelp: return "application/winhelp";
case application_wordperfect6_0: return "application/wordperfect6.0";
case application_wordperfect6_1: return "application/wordperfect6.1";
case application_wordperfect: return "application/wordperfect";
case application_x_123: return "application/x-123";
case application_x_7z_compressed: return "application/x-7z-compressed";
@@ -934,6 +932,8 @@ g_hash_table_insert(ext_table, "inf", (gpointer)application_inf);
g_hash_table_insert(ext_table, "jar", (gpointer)application_java_archive);
g_hash_table_insert(ext_table, "class", (gpointer)application_java);
g_hash_table_insert(ext_table, "json", (gpointer)application_json);
g_hash_table_insert(ext_table, "jsonl", (gpointer)application_ndjson);
g_hash_table_insert(ext_table, "ndjson", (gpointer)application_ndjson);
g_hash_table_insert(ext_table, "mrc", (gpointer)application_marc);
g_hash_table_insert(ext_table, "mbd", (gpointer)application_mbedlet);
g_hash_table_insert(ext_table, "aps", (gpointer)application_mime);
@@ -1008,12 +1008,12 @@ g_hash_table_insert(ext_table, "vmd", (gpointer)application_vocaltec_media_desc)
g_hash_table_insert(ext_table, "vmf", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(ext_table, "warc", (gpointer)application_warc);
g_hash_table_insert(ext_table, "hlp", (gpointer)application_winhelp);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect6_0);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect6_1);
g_hash_table_insert(ext_table, "wp", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp5", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wp6", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wpd", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w60", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "w61", (gpointer)application_wordperfect);
g_hash_table_insert(ext_table, "wk1", (gpointer)application_x_123);
g_hash_table_insert(ext_table, "7z", (gpointer)application_x_7z_compressed);
g_hash_table_insert(ext_table, "aim", (gpointer)application_x_aim);
@@ -1478,6 +1478,7 @@ g_hash_table_insert(mime_table, "application/java-archive", (gpointer)applicatio
g_hash_table_insert(mime_table, "application/java", (gpointer)application_java);
g_hash_table_insert(mime_table, "application/javascript", (gpointer)application_javascript);
g_hash_table_insert(mime_table, "application/json", (gpointer)application_json);
g_hash_table_insert(mime_table, "application/ndjson", (gpointer)application_ndjson);
g_hash_table_insert(mime_table, "application/marc", (gpointer)application_marc);
g_hash_table_insert(mime_table, "application/mbedlet", (gpointer)application_mbedlet);
g_hash_table_insert(mime_table, "application/mime", (gpointer)application_mime);
@@ -1533,8 +1534,6 @@ g_hash_table_insert(mime_table, "application/vocaltec-media-desc", (gpointer)app
g_hash_table_insert(mime_table, "application/vocaltec-media-file", (gpointer)application_vocaltec_media_file);
g_hash_table_insert(mime_table, "application/warc", (gpointer)application_warc);
g_hash_table_insert(mime_table, "application/winhelp", (gpointer)application_winhelp);
g_hash_table_insert(mime_table, "application/wordperfect6.0", (gpointer)application_wordperfect6_0);
g_hash_table_insert(mime_table, "application/wordperfect6.1", (gpointer)application_wordperfect6_1);
g_hash_table_insert(mime_table, "application/wordperfect", (gpointer)application_wordperfect);
g_hash_table_insert(mime_table, "application/x-123", (gpointer)application_x_123);
g_hash_table_insert(mime_table, "application/x-7z-compressed", (gpointer)application_x_7z_compressed);

View File

@@ -9,26 +9,35 @@
#include <magic.h>
// Minimum file sizes, in bytes, below which parse() does not hand a video / image to parse_media().
// The replacement lists are parenthesized so the macros behave as single values inside larger
// expressions (e.g. `x / MIN_VIDEO_SIZE`), and each macro is defined exactly once.
#define MIN_VIDEO_SIZE (1024 * 64)
#define MIN_IMAGE_SIZE (512)
/**
 * Read up to `size` bytes from a file-backed vfile into `buf`.
 *
 * The file is opened lazily: on the first call (f->fd == -1) the SHA1 context is
 * initialised and f->filepath is opened read-only. When f->calculate_checksum is
 * set, every chunk that was actually read is fed to the running SHA1 and
 * f->has_checksum is raised.
 *
 * @param f    vfile to read from; f->fd, f->sha1_ctx and f->has_checksum may be updated
 * @param buf  destination buffer, at least `size` bytes
 * @param size maximum number of bytes to read
 * @return the result of read() narrowed to int (bytes read, 0 at end of file,
 *         negative on error), or -1 if open() failed
 */
int fs_read(struct vfile *f, void *buf, size_t size) {
    if (f->fd == -1) {
        SHA1_Init(&f->sha1_ctx);

        f->fd = open(f->filepath, O_RDONLY);
        if (f->fd == -1) {
            LOG_ERRORF(f->filepath, "open(): [%d] %s", errno, strerror(errno))
            return -1;
        }
    }

    int ret = (int) read(f->fd, buf, size);

    // Hash only real data: read() returns -1 on failure, and a negative value
    // must never be forwarded to safe_sha1_update() as a length.
    if (ret > 0 && f->calculate_checksum) {
        f->has_checksum = TRUE;
        safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, ret);
    }

    return ret;
}
// Invoke the vfile's close callback, if one is set.
// The expansion already ends with ';', so it is used as a complete statement without a trailing
// semicolon at the call site. `f` is evaluated more than once, so pass a plain lvalue.
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
/**
 * Release a file-backed vfile.
 *
 * Does nothing when the file was never opened (f->fd == -1). Otherwise the
 * running SHA1 is finalised into f->sha1_digest and the descriptor is closed.
 */
void fs_close(struct vfile *f) {
    if (f->fd == -1) {
        return;
    }

    SHA1_Final(f->sha1_digest, &f->sha1_ctx);
    close(f->fd);
}
@@ -66,11 +75,13 @@ void parse(void *arg) {
doc->meta_tail = NULL;
doc->mime = 0;
doc->size = job->vfile.info.st_size;
doc->mtime = job->vfile.info.st_mtim.tv_sec;
doc->mtime = (int) job->vfile.info.st_mtim.tv_sec;
int inc_ts = incremental_get(ScanCtx.original_table, doc->path_md5);
if (inc_ts != 0 && inc_ts == job->vfile.info.st_mtim.tv_sec) {
pthread_mutex_lock(&ScanCtx.copy_table_mu);
incremental_mark_file_for_copy(ScanCtx.copy_table, doc->path_md5);
pthread_mutex_unlock(&ScanCtx.copy_table_mu);
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_skipped_files_count += 1;
@@ -93,18 +104,17 @@ void parse(void *arg) {
doc->mime = mime_get_mime_by_ext(ScanCtx.ext_table, job->filepath + job->ext);
}
int bytes_read = 0;
if (doc->mime == 0 && !ScanCtx.fast) {
// Get mime type with libmagic
if (!job->vfile.is_fs_file) {
if (job->vfile.read_rewindable == NULL) {
LOG_WARNING(job->filepath,
"Guessing mime type with libmagic inside archive files is not currently supported");
"File does not support rewindable reads, cannot guess Media type");
goto abort;
}
bytes_read = job->vfile.read(&job->vfile, buf, MAGIC_BUF_SIZE);
int bytes_read = job->vfile.read_rewindable(&job->vfile, buf, MAGIC_BUF_SIZE);
if (bytes_read < 0) {
if (job->vfile.is_fs_file) {
@@ -135,7 +145,9 @@ void parse(void *arg) {
}
}
job->vfile.reset(&job->vfile);
if (job->vfile.reset != NULL) {
job->vfile.reset(&job->vfile);
}
magic_close(magic);
}
@@ -149,7 +161,7 @@ void parse(void *arg) {
} else if ((mmime == MimeVideo && doc->size >= MIN_VIDEO_SIZE) ||
(mmime == MimeImage && doc->size >= MIN_IMAGE_SIZE) || mmime == MimeAudio) {
parse_media(&ScanCtx.media_ctx, &job->vfile, doc);
parse_media(&ScanCtx.media_ctx, &job->vfile, doc, mime_get_mime_text(doc->mime));
} else if (IS_PDF(doc->mime)) {
parse_ebook(&ScanCtx.ebook_ctx, &job->vfile, mime_get_mime_text(doc->mime), doc);
@@ -169,7 +181,7 @@ void parse(void *arg) {
IS_ARC(doc->mime) ||
(IS_ARC_FILTER(doc->mime) && should_parse_filtered_file(doc->filepath, doc->ext))
)) {
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc);
parse_archive(&ScanCtx.arc_ctx, &job->vfile, doc, ScanCtx.exclude, ScanCtx.exclude_extra);
} else if ((ScanCtx.ooxml_ctx.content_size > 0 || ScanCtx.media_ctx.tn_size > 0) && IS_DOC(doc->mime)) {
parse_ooxml(&ScanCtx.ooxml_ctx, &job->vfile, doc);
} else if (is_cbr(&ScanCtx.comic_ctx, doc->mime) || is_cbz(&ScanCtx.comic_ctx, doc->mime)) {
@@ -179,9 +191,15 @@ void parse(void *arg) {
} else if (doc->mime == MIME_SIST2_SIDECAR) {
parse_sidecar(&job->vfile, doc);
CLOSE_FILE(job->vfile)
free(doc->filepath);
free(doc);
return;
} else if (is_msdoc(&ScanCtx.msdoc_ctx, doc->mime)) {
parse_msdoc(&ScanCtx.msdoc_ctx, &job->vfile, doc);
} else if (is_json(&ScanCtx.json_ctx, doc->mime)) {
parse_json(&ScanCtx.json_ctx, &job->vfile, doc);
} else if (is_ndjson(&ScanCtx.json_ctx, doc->mime)) {
parse_ndjson(&ScanCtx.json_ctx, &job->vfile, doc);
}
abort:
@@ -198,9 +216,15 @@ void parse(void *arg) {
doc->has_parent = FALSE;
}
write_document(doc);
CLOSE_FILE(job->vfile)
if (job->vfile.has_checksum) {
char sha1_digest_str[SHA1_STR_LENGTH];
buf2hex((unsigned char *) job->vfile.sha1_digest, SHA1_DIGEST_LENGTH, (char *) sha1_digest_str);
APPEND_STR_META(doc, MetaChecksum, (const char *) sha1_digest_str);
}
write_document(doc);
}
void cleanup_parse() {

View File

@@ -3,7 +3,7 @@
#include "../sist.h"
#define MAGIC_BUF_SIZE 4096 * 6
#define MAGIC_BUF_SIZE (4096 * 6)
int fs_read(struct vfile *f, void *buf, size_t size);
void fs_close(struct vfile *f);

View File

@@ -27,7 +27,10 @@ void parse_sidecar(vfile_t *vfile, document_t *doc) {
MD5((unsigned char *) vfile->filepath + ScanCtx.index.desc.root_len, doc->ext - 1 - ScanCtx.index.desc.root_len,
path_md5);
store_write(ScanCtx.index.meta_store, (char *) path_md5, sizeof(path_md5), json_str, strlen(json_str) + 1);
char path_md5_str[MD5_STR_LENGTH];
buf2hex(path_md5, MD5_DIGEST_LENGTH, path_md5_str);
store_write(ScanCtx.index.meta_store, path_md5_str, MD5_STR_LENGTH, json_str, strlen(json_str) + 1);
cJSON_Delete(json);
free(json_str);

View File

@@ -1,6 +1,8 @@
#ifndef SIST_H
#define SIST_H
#define _GNU_SOURCE
#ifndef FALSE
#define FALSE (0)
#define BOOL int
@@ -26,6 +28,8 @@
#define UNUSED(x) __attribute__((__unused__)) x
#define MD5_STR_LENGTH 33
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
#include "util.h"
#include "log.h"
@@ -49,7 +53,7 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "2.11.1"
#define VERSION "2.11.6"
static const char *const Version = VERSION;
#ifndef SIST_PLATFORM

View File

@@ -177,7 +177,7 @@ static void *tpool_worker(void *arg) {
}
void tpool_wait(tpool_t *pool) {
LOG_INFO("tpool.c", "Waiting for worker threads to finish")
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish")
pthread_mutex_lock(&(pool->work_mutex));
while (TRUE) {
if (pool->done_cnt < pool->work_cnt) {
@@ -191,7 +191,9 @@ void tpool_wait(tpool_t *pool) {
}
}
}
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
if (pool->print_progress) {
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
}
pthread_mutex_unlock(&(pool->work_mutex));
LOG_INFO("tpool.c", "Worker threads finished")

View File

@@ -84,11 +84,13 @@ char *expandpath(const char *path) {
return expanded;
}
int PrintingProgressBar = 0;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
static int last_val = -1;
int val = (int) (percentage * 100);
if (last_val == val || val > 100 || index_size < 1024) {
if (last_val == val || val > 100) {
return;
}
last_val = val;
@@ -114,13 +116,21 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
index_unit = 'M';
}
printf(
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
fflush(stdout);
if (tn_size == 0 && index_size == 0) {
fprintf(stderr,
"\r%3d%%[%.*s>%*s]",
val, lpad, PBSTR, rpad, ""
);
} else {
fprintf(stderr,
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
}
PrintingProgressBar = TRUE;
}
GHashTable *incremental_get_table() {

View File

@@ -19,6 +19,8 @@ char *expandpath(const char *path);
dyn_buffer_t url_escape(char *str);
extern int PrintingProgressBar;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();
@@ -131,6 +133,9 @@ static int incremental_get_str(GHashTable *table, const char *path_md5) {
}
}
/**
* Not thread safe!
*/
__always_inline
static int incremental_mark_file_for_copy(GHashTable *table, const unsigned char path_md5[MD5_DIGEST_LENGTH]) {
char *ptr = malloc(MD5_STR_LENGTH);

View File

@@ -252,15 +252,34 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
mg_http_serve_file(nc, hm, full_path, mime, disposition);
}
/**
 * Fetch the Elasticsearch version once and keep it in WebCtx.es_version.
 *
 * The lookup is retried on every call until elastic_get_version() returns a
 * non-NULL result; after that the function is a no-op.
 */
void cache_es_version() {
    static int is_cached = FALSE;

    if (is_cached != TRUE) {
        es_version_t *fetched = elastic_get_version(WebCtx.es_url);

        if (fetched == NULL) {
            // Lookup failed, try again on the next call
            return;
        }

        WebCtx.es_version = fetched;
        is_cached = TRUE;
    }
}
void index_info(struct mg_connection *nc) {
cache_es_version();
cJSON *json = cJSON_CreateObject();
cJSON *arr = cJSON_AddArrayToObject(json, "indices");
cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
cJSON_AddStringToObject(json, "version", Version);
cJSON_AddStringToObject(json, "esVersion", format_es_version(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionLegacy", USE_LEGACY_ES_SETTINGS(WebCtx.es_version));
cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
cJSON_AddStringToObject(json, "libscanHash", LibScanCommitHash);
cJSON_AddStringToObject(json, "lang", WebCtx.lang);
cJSON_AddBoolToObject(json, "dev", WebCtx.dev);
#ifdef SIST_DEBUG

File diff suppressed because one or more lines are too long

1
third-party/libscan vendored

Submodule third-party/libscan deleted from 22522d7d4a

12
third-party/libscan/.gitignore vendored Normal file
View File

@@ -0,0 +1,12 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt
scan_test
third-party/ext_*
libscan-test-files
scan_*_test

233
third-party/libscan/CMakeLists.txt vendored Normal file
View File

@@ -0,0 +1,233 @@
cmake_minimum_required(VERSION 3.15)
project(scan)
set(CMAKE_C_STANDARD 11)
option(BUILD_TESTS "Build tests" on)
add_subdirectory(third-party/antiword)
add_compile_definitions(
antiword
NDEBUG
)
add_library(
scan
libscan/util.c libscan/util.h
libscan/scan.h
libscan/macros.h
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED)
find_package(Threads REQUIRED)
find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
find_package(unofficial-brotli CONFIG REQUIRED)
find_library(LZO2_LIB NAMES lzo2)
find_library(RAW_LIB NAMES libraw.a)
find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
target_compile_options(
scan
PRIVATE
-g
)
include(ExternalProject)
find_program(MAKE_EXE NAMES gmake nmake make)
ExternalProject_Add(
libmobi
GIT_REPOSITORY https://github.com/simon987/libmobi.git
GIT_TAG "public"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./autogen.sh && ./configure
INSTALL_COMMAND ""
PREFIX "third-party/ext_libmobi"
SOURCE_DIR "third-party/ext_libmobi/src/libmobi"
BINARY_DIR "third-party/ext_libmobi/src/libmobi"
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent
)
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
ExternalProject_Add(
ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "n4.4"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BUILD_COMMAND ${MAKE_EXE} -j33 --silent
)
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
add_dependencies(
scan
libmobi
ffmpeg
antiword
libwpd
)
target_link_libraries(
scan
PUBLIC
cjson
${LibArchive_LIBRARIES}
ZLIB::ZLIB
BZip2::BZip2
lz4::lz4
${LZO2_LIB}
LibLZMA::LibLZMA
${MUPDF_LIB}
openjp2
${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB}
${HARFBUZZ_LIB}
${JBIG2DEC_LIB}
stdc++
-Wl,--whole-archive
m
-Wl,--no-whole-archive
${JPEG_LIBRARIES}
${Tesseract_LIBRARIES}
${LIBXML2_LIBRARIES}
${FREETYPE_LIB}
unofficial::brotli::brotlidec-static
${FFMPEG_LIB_DIR}/libavformat/libavformat.a
${FFMPEG_LIB_DIR}/libavcodec/libavcodec.a
${FFMPEG_LIB_DIR}/libavutil/libavutil.a
${FFMPEG_LIB_DIR}/libswresample/libswresample.a
${FFMPEG_LIB_DIR}/libswscale/libswscale.a
z
${CMAKE_THREAD_LIBS_INIT}
${RAW_LIB}
${GOMP_LIB}
${CMS_LIB}
${JAS_LIB}
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
target_include_directories(
scan
PUBLIC
${MUPDF_INC_DIR}
${JPEG_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
)
# Optional GoogleTest executables (BUILD_TESTS option). The same three test sources
# are built three times: with UndefinedBehaviorSanitizer (scan_ub_test), with
# AddressSanitizer (scan_a_test) and without any sanitizer (scan_test).
if (BUILD_TESTS)
find_package(GTest CONFIG REQUIRED)
# UBSan build: the sanitizer flag is passed at both compile and link time
add_executable(scan_ub_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_ub_test PRIVATE -g -fsanitize=undefined -fno-omit-frame-pointer)
target_link_libraries(scan_ub_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=undefined scan)
# ASan build
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
# Plain build without sanitizers
add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
endif()

4
third-party/libscan/README.md vendored Normal file
View File

@@ -0,0 +1,4 @@
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
```

244
third-party/libscan/libscan/arc/arc.c vendored Normal file
View File

@@ -0,0 +1,244 @@
#include "arc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
/**
 * Decide whether a compressed file should be opened as an archive, based on
 * its name only.
 *
 * The answer is yes when the extension starts with "tgz", or when the part of
 * the name that precedes the extension ends in ".tar" (e.g. "backup.tar.gz").
 *
 * The comparison is done in place: no copy of the path is made, so the result
 * does not depend on the path fitting in a fixed-size buffer.
 *
 * @param filepath NUL-terminated path of the file
 * @param ext      offset of the extension inside filepath, i.e. the index of
 *                 the character that follows the last '.'; values <= 0 mean
 *                 "nothing to inspect"
 * @return non-zero when the file should be parsed as an archive, 0 otherwise
 */
int should_parse_filtered_file(const char *filepath, int ext) {
    static const char tar_suffix[] = ".tar";
    const size_t tar_suffix_len = sizeof(tar_suffix) - 1;

    if (ext <= 0) {
        return 0;
    }

    if (strncmp(filepath + ext, "tgz", 3) == 0) {
        return 1;
    }

    // The stem is everything before the '.' that introduces the extension
    const size_t stem_len = (size_t) ext - 1;

    return stem_len >= tar_suffix_len
           && memcmp(filepath + stem_len - tar_suffix_len, tar_suffix, tar_suffix_len) == 0;
}
/**
 * Close callback for a vfile that lives inside an archive.
 *
 * Finalizes the SHA1 context into f->sha1_digest, then releases the rewind
 * buffer (if one was allocated) and resets its bookkeeping fields.
 */
void arc_close(struct vfile *f) {
    SHA1_Final(f->sha1_digest, &f->sha1_ctx);

    if (f->rewind_buffer == NULL) {
        return;
    }

    free(f->rewind_buffer);
    f->rewind_buffer_cursor = 0;
    f->rewind_buffer_size = 0;
    f->rewind_buffer = NULL;
}
/**
 * Read up to `size` bytes of the current archive entry into `buf`.
 *
 * Bytes previously captured by arc_read_rewindable() are replayed first; once
 * the rewind buffer is drained the remainder comes from archive_read_data().
 * When f->calculate_checksum is set, the bytes obtained from
 * archive_read_data() are fed to f->sha1_ctx and f->has_checksum is raised.
 *
 * NOTE(review): bytes served from the rewind buffer are not added to the
 * checksum here, and arc_read_rewindable() does not hash them either --
 * confirm whether the digest is expected to cover them.
 *
 * @param f    vfile backed by an open libarchive reader (f->arc)
 * @param buf  destination buffer, at least `size` bytes long
 * @param size maximum number of bytes to read
 * @return number of bytes written to buf, or -1 when libarchive reports an error
 */
int arc_read(struct vfile *f, void *buf, size_t size) {
    // Byte-typed cursor: pointer arithmetic on void* is a compiler extension
    unsigned char *dst = buf;
    int bytes_copied = 0;

    if (f->rewind_buffer_size != 0) {
        const unsigned char *pending = (const unsigned char *) f->rewind_buffer + f->rewind_buffer_cursor;

        if (size <= (size_t) f->rewind_buffer_size) {
            // The request is fully satisfied by buffered bytes
            memcpy(dst, pending, size);
            f->rewind_buffer_size -= (int) size;
            f->rewind_buffer_cursor += (int) size;
            return (int) size;
        }

        // Drain what is left of the rewind buffer, then continue with the archive
        memcpy(dst, pending, f->rewind_buffer_size);
        bytes_copied = f->rewind_buffer_size;
        size -= f->rewind_buffer_size;
        dst += f->rewind_buffer_size;
        f->rewind_buffer_size = 0;
    }

    // archive_read_data() returns a signed count: negative values are error codes
    const long long bytes_read = archive_read_data(f->arc, dst, size);

    if (bytes_read > 0 && (size_t) bytes_read <= size && f->calculate_checksum) {
        f->has_checksum = TRUE;
        safe_sha1_update(&f->sha1_ctx, dst, (size_t) bytes_read);
    }

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    return (int) bytes_read + bytes_copied;
}
/**
 * Read the first bytes of an archive entry and remember them so that the next
 * arc_read() calls see the entry from its beginning again.
 *
 * Only the bytes that were actually read are kept: an entry shorter than
 * `size` must not replay the unread tail of `buf` later on.
 *
 * May be called at most once per entry; a second call while a rewind buffer
 * exists terminates the process.
 *
 * @param f    vfile backed by an open libarchive reader (f->arc)
 * @param buf  destination buffer, at least `size` bytes long
 * @param size maximum number of bytes to read
 * @return number of bytes read (possibly fewer than `size`), or -1 on error
 */
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
    if (f->rewind_buffer != NULL) {
        fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
        exit(-1);
    }

    // archive_read_data() returns a signed count: negative values are error codes
    const long long bytes_read = archive_read_data(f->arc, buf, size);

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    if (bytes_read > 0) {
        f->rewind_buffer = malloc((size_t) bytes_read);
        if (f->rewind_buffer == NULL) {
            // Without the copy the consumed bytes would be lost, so fail the read
            f->logf(f->filepath, LEVEL_ERROR, "Could not allocate %d bytes for the rewind buffer", (int) bytes_read);
            return -1;
        }
        memcpy(f->rewind_buffer, buf, (size_t) bytes_read);
    }

    f->rewind_buffer_size = (int) bytes_read;
    f->rewind_buffer_cursor = 0;

    return (int) bytes_read;
}
/**
 * Create a libarchive reader for `f` and open it.
 *
 * Files on the filesystem are opened by name. Files that are themselves
 * archive entries are read through the vfile callbacks, and only when
 * `allow_recurse` is non-zero.
 *
 * @param ctx           archive context; a non-empty passphrase is registered on the reader
 * @param f             file to open
 * @param a             receives the new reader (left untouched when ARC_SKIPPED is returned)
 * @param arc_data      callback state, bound to `f`
 * @param allow_recurse whether a non-filesystem file may be opened
 * @return ARC_SKIPPED when the file is not opened, otherwise the libarchive open result
 */
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
    arc_data->f = f;

    const int is_nested = !f->is_fs_file;
    if (is_nested && !allow_recurse) {
        return ARC_SKIPPED;
    }

    // Reader setup is the same for both sources
    *a = archive_read_new();
    archive_read_support_filter_all(*a);
    archive_read_support_format_all(*a);
    if (ctx->passphrase[0] != 0) {
        archive_read_add_passphrase(*a, ctx->passphrase);
    }

    if (is_nested) {
        return archive_read_open(
                *a, arc_data,
                vfile_open_callback,
                vfile_read_callback,
                vfile_close_callback
        );
    }

    return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
}
// Per-thread output vector handed to pcre_exec(); its length is a multiple of three
static __thread int sub_strings[30];
// Non-zero when `str` matches the exclude pattern. Expects `exclude` and `exclude_extra`
// to be in scope at the call site, and evaluates `str` twice.
// pcre_exec() takes the vector length as a number of elements, not a number of bytes.
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, (str), (int) strlen(str), 0, 0, sub_strings, (int) (sizeof(sub_strings) / sizeof(sub_strings[0]))) >= 0)
/**
 * Process an archive according to ctx->mode.
 *
 * - ARC_MODE_LIST: the names of all regular entries are joined with spaces and
 *   attached to `doc` as MetaContent.
 * - Any other mode: every regular entry is wrapped in a sub-job whose path is
 *   "<archive path>#/<entry name>" and handed to ctx->parse(). Entries whose
 *   path matches `exclude` are skipped.
 *
 * Nested archives (f is not a filesystem file) are only opened in
 * ARC_MODE_RECURSE; otherwise the function returns SCAN_OK without doing anything.
 *
 * @param ctx           archive context (mode, callbacks, passphrase)
 * @param f             archive file
 * @param doc           document describing the archive itself
 * @param exclude       compiled exclude pattern, or NULL for no filtering
 * @param exclude_extra pcre study data for `exclude`
 * @return SCAN_ERR_READ when the archive cannot be opened, SCAN_OK otherwise
 */
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;

    arc_data_t arc_data;
    arc_data.f = f;

    int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
    if (ret == ARC_SKIPPED) {
        return SCAN_OK;
    }

    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return SCAN_ERR_READ;
    }

    if (ctx->mode == ARC_MODE_LIST) {
        dyn_buffer_t buf = dyn_buffer_create();

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
                const char *utf8_name = archive_entry_pathname_utf8(entry);
                const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

                // Both lookups can fail; there is nothing to list for a nameless entry
                if (file_path == NULL) {
                    continue;
                }

                dyn_buffer_append_string(&buf, file_path);
                dyn_buffer_write_char(&buf, ' ');
            }
        }
        dyn_buffer_write_char(&buf, '\0');

        meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
        meta_list->key = MetaContent;
        strcpy(meta_list->str_val, buf.buf);
        APPEND_META(doc, meta_list)
        dyn_buffer_destroy(&buf);

    } else {
        // Number of bytes reserved after the job header for the entry path
        const size_t filepath_capacity = PATH_MAX * 2;

        parse_job_t *sub_job = malloc(sizeof(parse_job_t) + filepath_capacity);

        sub_job->vfile.close = arc_close;
        sub_job->vfile.read = arc_read;
        sub_job->vfile.read_rewindable = arc_read_rewindable;
        sub_job->vfile.reset = NULL;
        sub_job->vfile.arc = a;
        sub_job->vfile.filepath = sub_job->filepath;
        sub_job->vfile.is_fs_file = FALSE;
        sub_job->vfile.rewind_buffer_size = 0;
        sub_job->vfile.rewind_buffer = NULL;
        sub_job->vfile.log = ctx->log;
        sub_job->vfile.logf = ctx->logf;
        sub_job->vfile.has_checksum = FALSE;
        sub_job->vfile.calculate_checksum = f->calculate_checksum;
        memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            sub_job->vfile.info = *archive_entry_stat(entry);

            if (!S_ISREG(sub_job->vfile.info.st_mode)) {
                continue;
            }

            const char *utf8_name = archive_entry_pathname_utf8(entry);
            const char *entry_name = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
            if (entry_name == NULL) {
                CTX_LOG_ERRORF(f->filepath, "(arc.c) Skipping archive entry without a readable name%s", "")
                continue;
            }

            // Entry names come from the archive and can be arbitrarily long: never
            // write past the space reserved for the path
            int path_len = snprintf(sub_job->filepath, filepath_capacity, "%s#/%s", f->filepath, entry_name);
            if (path_len < 0 || (size_t) path_len >= filepath_capacity) {
                CTX_LOG_ERRORF(f->filepath, "(arc.c) Skipping archive entry, path is too long: %s", entry_name)
                continue;
            }

            sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

            // Handle excludes
            if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
                CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
                continue;
            }

            // Only a '.' located after the archive's own path counts as an extension
            char *p = strrchr(sub_job->filepath, '.');
            if (p != NULL && (size_t) (p - sub_job->filepath) > strlen(f->filepath)) {
                sub_job->ext = (int) (p - sub_job->filepath + 1);
            } else {
                sub_job->ext = (int) strlen(sub_job->filepath);
            }

            SHA1_Init(&sub_job->vfile.sha1_ctx);

            ctx->parse(sub_job);
        }

        free(sub_job);
    }

    archive_read_free(a);

    return SCAN_OK;
}

80
third-party/libscan/libscan/arc/arc.h vendored Normal file
View File

@@ -0,0 +1,80 @@
#ifndef SCAN_ARC_H
#define SCAN_ARC_H
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
# define ARC_SKIPPED (-1)
#define ARC_MODE_SKIP 0
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
typedef struct {
archive_mode_t mode;
parse_callback_t parse;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
char passphrase[4096];
} scan_arc_ctx_t;
#define ARC_BUF_SIZE 8192
typedef struct {
vfile_t *f;
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
int should_parse_filtered_file(const char *filepath, int ext);
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
int arc_read(struct vfile *f, void *buf, size_t size);
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif

View File

@@ -0,0 +1,58 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
/**
 * Create the thumbnail of a comic book archive from one of its images.
 *
 * Walks the regular entries of the archive; each entry whose name ends in
 * ".png", ".jpg" or ".jpeg" is loaded in memory and passed to
 * store_image_thumbnail(). The walk stops at the first entry for which a
 * thumbnail was stored, or at the first entry that cannot be read completely.
 *
 * Nothing is done when thumbnails are disabled (ctx->tn_size <= 0).
 *
 * @param ctx comic context
 * @param f   comic book archive
 * @param doc document the thumbnail belongs to
 */
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
        if (file_path == NULL) {
            // Neither name lookup succeeded: the extension cannot be inspected
            continue;
        }

        const char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".png") != 0 && strcmp(p, ".jpg") != 0 && strcmp(p, ".jpeg") != 0)) {
            continue;
        }

        // The entry size is a signed 64-bit value; there is nothing to decode
        // when it is zero or negative
        const long long declared_size = archive_entry_size(entry);
        if (declared_size <= 0) {
            continue;
        }
        size_t entry_size = (size_t) declared_size;

        void *buf = malloc(entry_size);
        if (buf == NULL) {
            CTX_LOG_ERRORF("comic.c", "Could not allocate %lld bytes for entry %s", declared_size, file_path)
            continue;
        }

        size_t bytes_read = archive_read_data(a, buf, entry_size);
        if (bytes_read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }

        ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path);
        free(buf);

        if (ret == TRUE) {
            break;
        }
    }

    archive_read_free(a);
}

View File

@@ -0,0 +1,31 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
int tn_size;
float tn_qscale;
unsigned int cbr_mime;
unsigned int cbz_mime;
} scan_comic_ctx_t;
/**
 * @return non-zero when `mime` is the mime id registered in the context as CBR
 */
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
    const unsigned int cbr_mime = ctx->cbr_mime;
    return cbr_mime == mime;
}
/**
 * @return non-zero when `mime` is the mime id registered in the context as CBZ
 */
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
    const unsigned int cbz_mime = ctx->cbz_mime;
    return cbz_mime == mime;
}
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@@ -0,0 +1,478 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
#include "../ocr/ocr.h"
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
pthread_mutex_t Mutex;
/**
 * MuPDF lock callback. Only the FreeType lock is backed by a real mutex;
 * requests for any other lock are ignored.
 */
static void my_fz_lock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }

    pthread_mutex_lock(&Mutex);
}
/**
 * MuPDF unlock callback, counterpart of my_fz_lock(): releases the mutex for
 * the FreeType lock and ignores every other lock.
 */
static void my_fz_unlock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }

    pthread_mutex_unlock(&Mutex);
}
int pixmap_is_blank(const fz_pixmap *pixmap) {
int pixmap_size = pixmap->n * pixmap->w * pixmap->h;
const int pixel0 = pixmap->samples[0];
for (int i = 0; i < pixmap_size; i++) {
if (pixmap->samples[i] != pixel0) {
return FALSE;
}
}
return TRUE;
}
/**
 * Load one page of a document and render it into an RGB pixmap scaled so that
 * its longest side is ctx->tn_size.
 *
 * @param ctx   ebook context, provides tn_size and the log callbacks
 * @param page  index of the page to load
 * @param fzctx MuPDF context
 * @param fzdoc opened MuPDF document
 * @param doc   document being parsed, only used for log messages
 * @param cover receives the loaded page
 * @return the rendered pixmap, or NULL on failure. On success the caller is
 *         left holding both the pixmap and *cover; when rendering fails after
 *         the page was loaded, both are dropped here before returning NULL.
 */
fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0;
fz_var(cover);
fz_var(err);
fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
// A failed page load is recorded as the constant 1, not as the MuPDF error code
fz_catch(fzctx)err = 1;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
return NULL;
}
fz_rect bounds = fz_bound_page(fzctx, *cover);
float scale;
float w = bounds.x1 - bounds.x0;
float h = bounds.y1 - bounds.y0;
// Scale by the longest side so that the result fits in a tn_size x tn_size square
// NOTE(review): a page with a zero-sized bounding box would make this a division by zero -- confirm it cannot happen
if (w > h) {
scale = (float) ctx->tn_size / w;
} else {
scale = (float) ctx->tn_size / h;
}
fz_matrix m = fz_scale(scale, scale);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
// NOTE(review): the pixmap and the draw device are created outside of any fz_try block -- confirm how a failure here is handled
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fz_device_rgb(fzctx), bbox, NULL, 0);
// Fill every sample with 0xFF before the page is drawn on top
fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
fz_var(err);
fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} fz_always(fzctx) {
// The device is closed and dropped whether or not the page could be drawn
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
// render_cover() feeds the samples to swscale as RGB24, so exactly 3 components are required
if (pixmap->n != 3) {
CTX_LOG_ERRORF(doc->filepath, "Got unexpected pixmap depth: %d", pixmap->n)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
return pixmap;
}
/**
 * Render the cover of a document, encode it with the JPEG encoder and store
 * it as the thumbnail of `doc`.
 *
 * Page 0 is used unless its rendering is blank (see pixmap_is_blank()), in
 * which case page 1 is tried instead. The thumbnail dimensions are appended
 * to the document metadata and the encoded image is passed to ctx->store()
 * under the key doc->path_md5.
 *
 * @return FALSE when no page could be rendered, TRUE otherwise
 */
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
fz_page *cover = NULL;
fz_pixmap *pixmap = load_pixmap(ctx, 0, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
if (pixmap_is_blank(pixmap)) {
// Release the blank rendering before loading the replacement page
fz_drop_page(fzctx, cover);
fz_drop_pixmap(fzctx, pixmap);
CTX_LOG_DEBUG(doc->filepath, "Cover page is blank, using page 1 instead")
pixmap = load_pixmap(ctx, 1, fzctx, fzdoc, doc, &cover);
if (pixmap == NULL) {
return FALSE;
}
}
// RGB24 -> YUV420p
// NOTE(review): the results of av_frame_alloc(), sws_getContext() and av_malloc() are used without a NULL check
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
pixmap->w, pixmap->h, AV_PIX_FMT_RGB24,
pixmap->w, pixmap->h, AV_PIX_FMT_YUV420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
1);
// The samples are copied into a zero-filled 1 GiB scratch buffer before scaling.
// NOTE(review): the size is a fixed constant unrelated to stride * h and the calloc() result is
// not checked; presumably the oversized buffer gives sws_scale() room to read past the last row -- confirm
unsigned char *samples = calloc(1, 1024 * 1024 * 1024);
memcpy(samples, pixmap->samples, pixmap->stride * pixmap->h);
const uint8_t *in_data[1] = {samples,};
int in_line_size[1] = {(int) pixmap->stride};
sws_scale(sws_ctx,
in_data, in_line_size,
0, pixmap->h,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = pixmap->w;
scaled_frame->height = pixmap->h;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
// YUV420p -> JPEG
// NOTE(review): the return values of avcodec_send_frame() and avcodec_receive_packet() are ignored,
// so jpeg_packet is stored even if encoding failed
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, pixmap->w, pixmap->h)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
// Release everything that was allocated above
free(samples);
av_packet_unref(&jpeg_packet);
// scaled_frame->data[0] is dst_buf, which av_frame_free() does not release
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
fz_drop_pixmap(fzctx, pixmap);
fz_drop_page(fzctx, cover);
return TRUE;
}
/**
 * MuPDF error printer: forwards the message as a warning through the log
 * callbacks of the calling thread's ebook context.
 *
 * @param user    the document_t being parsed (installed by init_fzctx())
 * @param message text produced by MuPDF
 */
void fz_err_callback(void *user, const char *message) {
    // The logging macro looks for a variable named ctx
    const scan_ebook_ctx_t *ctx = &thread_ctx;
    document_t *current_doc = user;

    CTX_LOG_WARNINGF(current_doc->filepath, "FZ: %s", message)
}
/**
 * MuPDF warning printer: forwards the message at debug level through the log
 * callbacks of the calling thread's ebook context.
 *
 * @param user    the document_t being parsed (installed by init_fzctx())
 * @param message text produced by MuPDF
 */
void fz_warn_callback(void *user, const char *message) {
    // The logging macro looks for a variable named ctx
    const scan_ebook_ctx_t *ctx = &thread_ctx;
    document_t *current_doc = user;

    CTX_LOG_DEBUGF(current_doc->filepath, "FZ: %s", message)
}
static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx);
static int mu_is_initialized = FALSE;
if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = TRUE;
}
fzctx->warn.print_user = doc;
fzctx->warn.print = fz_warn_callback;
fzctx->error.print_user = doc;
fzctx->error.print = fz_err_callback;
fzctx->locks.lock = my_fz_lock;
fzctx->locks.unlock = my_fz_unlock;
}
/**
 * Append the characters of one structured-text block to a text buffer.
 *
 * Blocks that are not text blocks are ignored. A space is written before
 * every line and once more after the last line.
 *
 * @param block block to read
 * @param tex   destination buffer
 * @return TEXT_BUF_FULL as soon as appending a character reports a full
 *         buffer, 0 otherwise
 */
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
    if (block->type != FZ_STEXT_BLOCK_TEXT) {
        return 0;
    }

    for (fz_stext_line *line = block->u.t.first_line; line != NULL; line = line->next) {
        text_buffer_append_char(tex, ' ');

        for (fz_stext_char *ch = line->first_char; ch != NULL; ch = ch->next) {
            if (text_buffer_append_char(tex, ch->c) == TEXT_BUF_FULL) {
                return TEXT_BUF_FULL;
            }
        }
    }

    text_buffer_append_char(tex, ' ');
    return 0;
}
/**
 * OCR result callback: appends the recognised text, minus its last byte, to
 * the calling thread's text buffer.
 *
 * Empty results are ignored: with len == 0 the `len - 1` below would wrap
 * around to SIZE_MAX.
 *
 * @param text recognised text
 * @param len  length reported by the OCR layer; presumably it counts one
 *             trailing byte that must not be copied -- TODO confirm against
 *             ocr_extract_text()
 */
static void fill_image_ocr_cb(const char* text, size_t len) {
    if (text == NULL || len == 0) {
        return;
    }

    text_buffer_append_string(&thread_buffer, text, len - 1);
}
/**
 * fill_image hook of the structured-text device: runs OCR on images embedded
 * in a page.
 *
 * Images narrower than MIN_OCR_WIDTH, shorter than MIN_OCR_HEIGHT, or whose
 * component count is rejected by OCR_IS_VALID_BPP are skipped. The recognised
 * text is delivered to fill_image_ocr_cb().
 */
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
                fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
                UNUSED(fz_color_params color_params)) {

    const int too_small = img->w < MIN_OCR_WIDTH || img->h < MIN_OCR_HEIGHT;
    if (too_small || !OCR_IS_VALID_BPP(img->n)) {
        return;
    }

    int l2factor = 0;
    fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);

    ocr_extract_text(thread_ctx.tesseract_path, thread_ctx.tesseract_lang, pix->samples, pix->w, pix->h, pix->n, pix->stride, pix->xres, fill_image_ocr_cb);

    fz_drop_pixmap(fzctx, pix);
}
/**
 * Parse an in-memory document with MuPDF and attach the results to doc.
 *
 * Sequence: open the document from buf, record the page count, render a cover
 * thumbnail when ctx->tn_size > 0, then (unless tn_only is set) read the
 * title/author metadata and extract page text into a MetaContent line.
 * Every return path drops the stream, the document and the MuPDF context.
 *
 * @param ctx      scan settings; copied into thread_ctx so callbacks such as
 *                 fill_image can read them
 * @param buf      document bytes
 * @param buf_len  number of bytes in buf
 * @param mime_str forwarded to fz_open_document_with_stream()
 * @param doc      receives the metadata lines
 * @param tn_only  when non-zero, return right after the thumbnail step
 */
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
// NOTE(review): the result of fz_new_context() is not checked for NULL
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
thread_ctx = *ctx;
init_fzctx(fzctx, doc);
int err = 0;
fz_document *fzdoc = NULL;
fz_stream *stream = NULL;
// fz_var() marks locals that are modified inside fz_try and read afterwards
fz_var(fzdoc);
fz_var(stream);
fz_var(err);
// Open the document from memory; any MuPDF error lands in err
fz_try(fzctx) {
stream = fz_open_memory(fzctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
int page_count = -1;
fz_var(err);
fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
APPEND_LONG_META(doc, MetaPages, page_count)
// Thumbnail step: a failed cover render aborts the whole parse
if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
}
if (tn_only) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// Title and author: lookup errors are swallowed and the field is simply left empty
char title[8192] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_catch(fzctx);
if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
char author[4096] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_catch(fzctx);
if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
// Text extraction, page by page, until the text buffer reaches ctx->content_size
if (ctx->content_size > 0) {
fz_stext_options opts = {0};
thread_buffer = text_buffer_create(ctx->content_size);
for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL;
fz_var(err);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// NOTE(review): these two constructors run outside fz_try; confirm they cannot throw here
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
// Clear the device callbacks that are not wanted for text extraction
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
// With an OCR language configured, images on the page are routed to fill_image
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// Copy the page's text blocks into the buffer until it reports it is full
fz_stext_block *block = stext->first_block;
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
// Stop reading pages once the content limit is reached
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
break;
}
}
text_buffer_terminate_string(&thread_buffer);
// presumably dyn_buffer.cur includes the terminator written above - TODO confirm
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&thread_buffer);
}
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
}
// Archive context passed to arc_open() by parse_epub_fast(); its passphrase is zero-initialised
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
/**
 * Fast EPUB text extraction that bypasses MuPDF: the file is opened as an
 * archive and the markup of every regular ".html"/".xhtml" entry is appended
 * to a text buffer, which is then stored as the document's MetaContent line.
 *
 * Nothing is done when ctx->tn_size <= 0 or when the archive cannot be opened.
 * Extraction stops early when the text buffer is full, when an entry cannot
 * be read completely, or when memory for an entry cannot be allocated. What
 * was collected up to that point is still stored.
 *
 * @param ctx scan settings (content_size bounds the extracted text)
 * @param f   virtual file holding the EPUB
 * @param doc receives the MetaContent line
 */
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    // NOTE(review): gating text extraction on tn_size looks odd - confirm it is intended
    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    // Created only after the early exits above so that they cannot leak it
    text_buffer_t content_buffer = text_buffer_create(ctx->content_size);

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

        char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".html") != 0 && strcmp(p, ".xhtml") != 0)) {
            continue;
        }

        size_t entry_size = archive_entry_size(entry);
        // One extra byte for the terminator expected by text_buffer_append_markup()
        char *buf = malloc(entry_size + 1);
        if (buf == NULL) {
            CTX_LOG_ERROR(f->filepath, "(ebook.c) Could not allocate memory for archive entry")
            break;
        }

        size_t read = archive_read_data(a, buf, entry_size);
        buf[entry_size] = '\0';

        if (read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }

        ret = text_buffer_append_markup(&content_buffer, buf);
        free(buf);

        if (ret == TEXT_BUF_FULL) {
            break;
        }
    }

    text_buffer_terminate_string(&content_buffer);

    meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
    meta_content->key = MetaContent;
    memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
    APPEND_META(doc, meta_content)

    text_buffer_destroy(&content_buffer);
    archive_read_free(a);
}
/**
 * Entry point for e-book parsing. EPUB files take the archive-based fast path
 * when ctx->fast_epub_parse is set. Everything else is read fully into memory
 * and handed to parse_ebook_mem() with tn_only disabled.
 */
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
    const int take_fast_path = ctx->fast_epub_parse && is_epub(mime_str);

    if (!take_fast_path) {
        size_t buf_len;
        void *buf = read_all(f, &buf_len);

        if (buf != NULL) {
            parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE);
            free(buf);
        } else {
            CTX_LOG_ERROR(f->filepath, "read_all() failed")
        }
        return;
    }

    parse_epub_fast(ctx, f, doc);
}

View File

@@ -0,0 +1,30 @@
#ifndef SCAN_EBOOK_H
#define SCAN_EBOOK_H
#include "../scan.h"
// Settings and callbacks used by the e-book parser
typedef struct {
// Size given to the text buffer that collects extracted content; <= 0 skips text extraction in parse_ebook_mem()
long content_size;
// A cover thumbnail is rendered when > 0
int tn_size;
// OCR language handed to ocr_extract_text(); NULL disables OCR of embedded images
const char *tesseract_lang;
// Data path handed to ocr_extract_text()
const char *tesseract_path;
// presumably serialises MuPDF access across threads - not used in the visible code, TODO confirm
pthread_mutex_t mupdf_mutex;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Callback used to persist generated data such as thumbnails
store_callback_t store;
// When non-zero, EPUB files are parsed with parse_epub_fast()
int fast_epub_parse;
// presumably the thumbnail encoder quality scale - TODO confirm
float tn_qscale;
} scan_ebook_ctx_t;
// Parse the e-book stored in virtual file f and attach its metadata to doc
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
// Same as parse_ebook() for a document already held in memory; tn_only stops after the thumbnail step
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
// Returns 1 when mime_string is exactly the EPUB mime type, 0 otherwise
__always_inline
static int is_epub(const char *mime_string) {
    if (strcmp(mime_string, "application/epub+zip") != 0) {
        return 0;
    }
    return 1;
}
#endif

246
third-party/libscan/libscan/font/font.c vendored Normal file
View File

@@ -0,0 +1,246 @@
#include "font.h"
#include <ft2build.h>
#include <freetype/freetype.h>
#include "../util.h"
// Per-thread FreeType library handle; initialised on first use in parse_font() and released by cleanup_font()
__thread FT_Library ft_lib = NULL;
// Pixel size of the bitmap needed to render a string, as computed by text_dimension()
typedef struct text_dimensions {
// Sum over all characters of max(advance, glyph width) plus kerning
unsigned int width;
// Largest ascent plus largest descent over all characters
unsigned int height;
// Largest descent over all characters
unsigned int baseline;
} text_dimensions_t;
// Metrics of one loaded glyph, filled from an FT_GlyphSlot by ft_glyph_to_glyph()
typedef struct glyph {
// slot->bitmap_top
int top;
// Bitmap rows
int height;
// Bitmap width
int width;
// max(0, height - top)
int descent;
// max(0, max(top, height) - descent)
int ascent;
// slot->advance.x divided by 64
int advance_width;
// Points at the slot's bitmap buffer; it is not copied
unsigned char *pixmap;
} glyph_t;
/**
 * Horizontal kerning, in whole pixels, between the previous character pc and
 * the current character c.
 *
 * FT_Get_Kerning() works on glyph indices given in (left, right) order, so
 * both character codes are first mapped with FT_Get_Char_Index() and the
 * previous character is passed as the left glyph.
 *
 * @param c    current character
 * @param pc   previous character, or 0 when c is the first one
 * @param face face used to render the text
 * @return kerning in pixels; 0 when there is no previous character or the
 *         lookup fails
 */
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
    FT_Vector kerning = {0, 0};

    if (pc == 0) {
        return 0;
    }

    // Character codes are converted the same way as when they are given to FT_Load_Char()
    FT_UInt left_glyph = FT_Get_Char_Index(face, (FT_ULong) pc);
    FT_UInt right_glyph = FT_Get_Char_Index(face, (FT_ULong) c);

    if (FT_Get_Kerning(face, left_glyph, right_glyph, FT_KERNING_DEFAULT, &kerning) != 0) {
        return 0;
    }

    // FT_KERNING_DEFAULT yields 26.6 fixed-point values
    return (int) (kerning.x / 64);
}
/**
 * Capture the metrics of the glyph currently held in slot. The returned
 * pixmap pointer aliases the slot's bitmap buffer.
 */
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
    const int rows = (int) slot->bitmap.rows;
    const int top = slot->bitmap_top;
    const int below_baseline = MAX(0, rows - top);

    glyph_t result = {
            .top = top,
            .height = rows,
            .width = (int) slot->bitmap.width,
            .descent = below_baseline,
            .ascent = MAX(0, MAX(top, rows) - below_baseline),
            .advance_width = (int) slot->advance.x / 64,
            .pixmap = slot->bitmap.buffer,
    };
    return result;
}
/**
 * Measure the bitmap needed to draw text with face: accumulates per-character
 * widths (including kerning) and tracks the largest ascent and descent.
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t result;
    result.width = 0;

    unsigned int tallest_ascent = 0;
    int deepest_descent = 0;
    char previous = 0;

    const int length = (int) strlen(text);
    int idx = 0;
    while (idx < length) {
        const char current = text[idx];

        FT_Load_Char(face, current, 0);
        glyph_t g = ft_glyph_to_glyph(face->glyph);

        deepest_descent = MAX(deepest_descent, g.descent);
        tallest_ascent = MAX(tallest_ascent, MAX(g.height, g.ascent));

        int kern = kerning_offset(current, previous, face);
        result.width += MAX(g.advance_width, g.width) + kern;

        previous = current;
        idx++;
    }

    result.height = tallest_ascent + deepest_descent;
    result.baseline = deepest_descent;
    return result;
}
/**
 * OR the glyph's pixels into bitmap with the glyph's top-left corner at
 * (x, y). bitmap is text_info.width x text_info.height bytes; writes that
 * would land past its end are skipped.
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    const unsigned int limit = text_info.width * text_info.height;
    // Distance from the end of one glyph row to the start of the next one
    const unsigned int skip = text_info.width - glyph->width;

    unsigned int in = 0;
    unsigned int out = y * text_info.width + x;

    unsigned int row = 0;
    while (row < glyph->height) {
        unsigned int col = 0;
        while (col < glyph->width) {
            if (out < limit) {
                bitmap[out] |= glyph->pixmap[in];
            }
            in++;
            out++;
            col++;
        }
        out += skip;
        row++;
    }
}
/**
 * Serialise an 8-bit bitmap as a BMP file into buf.
 *
 * Layout: 14-byte file header, 40-byte DIB header, 256-entry colour table,
 * then the pixel rows from bottom to top. The colour table is an inverted
 * grey ramp (index 0 is white, index 255 is black).
 *
 * Each row is padded to a multiple of four bytes based on the row length
 * itself. The headers and colour table add up to 1078 bytes, which is not a
 * multiple of four, so padding derived from the absolute buffer offset would
 * mis-pad the first row.
 *
 * @param buf        output buffer; expected to be empty, because the file
 *                   size field is taken from buf->cur
 * @param dimensions width and height of bitmap in pixels
 * @param bitmap     dimensions.width * dimensions.height bytes, top row first
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {
    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale)
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // Pixel array: write from bottom to top, with rows padded to multiples of 4-bytes
    const unsigned int row_padding = (4 - dimensions.width % 4) % 4;
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int pad = 0; pad < row_padding; pad++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Size: the field sits at byte offset 2, so it is copied rather than stored through an int pointer
    int file_size = (int) buf->cur;
    memcpy((char *) buf->buf + 2, &file_size, sizeof(file_size));
}
/**
 * Parse a font file: record its name as MetaFontName and, depending on
 * ctx->enable_tn, render the name with the font itself into a BMP thumbnail
 * that is passed to ctx->store.
 *
 * The font name is "<family> <style>", or just the family when the style is
 * missing or "?". A missing family is reported as "(null)". The name is
 * always NUL-terminated, truncated to the local buffer if necessary.
 *
 * @param ctx font scan settings and callbacks
 * @param f   virtual file holding the font
 * @param doc receives the metadata lines
 */
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (ft_lib == NULL) {
        FT_Init_FreeType(&ft_lib);
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
    if (err != 0) {
        CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
                       FT_Error_String(err))
        free(buf);
        return;
    }

    char font_name[4096];
    if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
        if (face->family_name == NULL) {
            strcpy(font_name, "(null)");
        } else {
            // snprintf always terminates, unlike strncpy with a full-length source
            snprintf(font_name, sizeof(font_name), "%s", face->family_name);
        }
    } else {
        // Never hand a NULL pointer to %s
        snprintf(font_name, sizeof(font_name), "%s %s",
                 face->family_name != NULL ? face->family_name : "(null)", face->style_name);
    }

    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name));
    meta_name->key = MetaFontName;
    strcpy(meta_name->str_val, font_name);
    APPEND_META(doc, meta_name)

    // NOTE(review): this skips the thumbnail when enable_tn is TRUE, which looks inverted - confirm against the caller
    if (ctx->enable_tn == TRUE) {
        FT_Done_Face(face);
        free(buf);
        return;
    }

    int pixel = 64;
    int num_chars = (int) strlen(font_name);

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
        CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
                         FT_Error_String(err))
        FT_Done_Face(face);
        free(buf);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);
    unsigned char *bitmap = calloc(dimensions.width * dimensions.height, 1);
    // calloc may legitimately return NULL for a zero-sized request, so only a non-empty bitmap is an error
    if (bitmap == NULL && dimensions.width * dimensions.height != 0) {
        CTX_LOG_ERROR(doc->filepath, "(font.c) Could not allocate thumbnail bitmap")
        FT_Done_Face(face);
        free(buf);
        return;
    }

    FT_Vector pen;
    pen.x = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            // Retry with the other letter case before giving up on this character
            c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
            err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
            if (err != 0) {
                CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
                                 FT_Error_String(err))
                continue;
            }
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        if (pen.x <= 0) {
            pen.x = ABS(glyph.advance_width - glyph.width);
        }
        // Align the glyph on the common baseline
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

    APPEND_TN_META(doc, dimensions.width, dimensions.height)
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);
    FT_Done_Face(face);
    free(buf);
}
void cleanup_font() {
FT_Done_FreeType(ft_lib);
}

17
third-party/libscan/libscan/font/font.h vendored Normal file
View File

@@ -0,0 +1,17 @@
#ifndef SCAN_FONT_H
#define SCAN_FONT_H
#include "../scan.h"
// Settings and callbacks used by the font parser
typedef struct {
// Thumbnail switch read by parse_font() (note: parse_font() returns before rendering when this is TRUE)
int enable_tn;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Receives the generated BMP thumbnail, keyed by the document's path_md5
store_callback_t store;
} scan_font_ctx_t;
// Extract the font name (and possibly a rendered-name thumbnail) from the font in f
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc);
// Release the calling thread's FreeType library handle
void cleanup_font();
#endif

119
third-party/libscan/libscan/json/json.c vendored Normal file
View File

@@ -0,0 +1,119 @@
#include "json.h"
#include "cjson/cJSON.h"
// Files larger than this many bytes (50 MiB) are skipped by parse_json()
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
/**
 * Recursively append every string value found in json to tex, each followed
 * by a space. Objects and arrays are walked depth-first through their child
 * list. Other value types are ignored.
 *
 * @return TRUE as soon as the text buffer reports it is full, FALSE otherwise
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
        cJSON *item = json->child;
        while (item != NULL) {
            if (json_extract_text(item, tex)) {
                return TRUE;
            }
            item = item->next;
        }
        return FALSE;
    }

    if (!cJSON_IsString(json)) {
        return FALSE;
    }

    if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
        return TRUE;
    }
    if (text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL) {
        return TRUE;
    }
    return FALSE;
}
/**
 * Read a whole JSON file, collect all of its string values and store them as
 * the document's MetaContent line.
 *
 * @return SCAN_ERR_SKIP when the file exceeds JSON_MAX_FILE_SIZE,
 *         SCAN_ERR_READ when it cannot be read or memory runs out,
 *         SCAN_OK otherwise (including when the content is not valid JSON,
 *         in which case the stored content is empty)
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // Grow by one byte for the terminator cJSON requires. The original pointer
    // is kept until realloc succeeds so that a failure does not leak it.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    // A parse failure yields NULL, which json_extract_text() and cJSON_Delete() both tolerate
    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
// Size in bytes (5 MiB) of the sliding window used by parse_ndjson()
#define JSON_BUF_SIZE (1024 * 1024 * 5)

/**
 * Extract the string values of a newline-delimited JSON file into the
 * document's MetaContent line.
 *
 * The file is processed through a fixed window of JSON_BUF_SIZE bytes: one
 * JSON value is parsed from the start of the window, the consumed bytes are
 * shifted out, and the freed tail is refilled from the file.
 *
 * @return SCAN_ERR_READ when the window cannot be allocated, SCAN_OK otherwise
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {
    char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
    if (buf == NULL) {
        return SCAN_ERR_READ;
    }
    *(buf + JSON_BUF_SIZE) = '\0';

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        cJSON *json;

        if (!eof) {
            // First pass fills the whole window; later passes refill as many
            // bytes as the previous parse consumed
            to_read = parse_end == buf ? JSON_BUF_SIZE : parse_end - buf;
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
            }
        }

        json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        // Nothing was consumed: no further value can be parsed
        if (parse_end == buf) {
            cJSON_Delete(json);
            break;
        }

        json_extract_text(json, &tex);
        cJSON_Delete(json);

        // Shift the unparsed remainder to the front; ptr marks where the refill goes
        memmove(buf, parse_end, (buf + JSON_BUF_SIZE - parse_end));
        ptr = buf + JSON_BUF_SIZE - parse_end + buf;
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    // Without a return statement, callers were reading an indeterminate status
    return SCAN_OK;
}

30
third-party/libscan/libscan/json/json.h vendored Normal file
View File

@@ -0,0 +1,30 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
// Settings and callbacks used by the JSON parsers
typedef struct {
// Size given to the text buffer that collects extracted strings
long content_size;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Storage callback; not used by the visible JSON code
store_callback_t store;
// Mime identifier compared by is_json()
unsigned int json_mime;
// Mime identifier compared by is_ndjson()
unsigned int ndjson_mime;
} scan_json_ctx_t;
// Extract all string values of a JSON file into the document's MetaContent
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
// Same for newline-delimited JSON, processed through a fixed-size buffer
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
// Non-zero when mime is the identifier configured as JSON in ctx
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int configured = ctx->json_mime;
    return configured == mime;
}
// Non-zero when mime is the identifier configured as newline-delimited JSON in ctx
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int configured = ctx->ndjson_mime;
    return configured == mime;
}
#endif

62
third-party/libscan/libscan/macros.h vendored Normal file
View File

@@ -0,0 +1,62 @@
// Boolean helpers. Note that BOOL is only defined when FALSE was not already defined.
#ifndef FALSE
#define FALSE (0)
#define BOOL int
#endif
#ifndef TRUE
#define TRUE (!FALSE)
#endif
// MAX/MIN/ABS evaluate their arguments more than once: do not pass expressions with side effects
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
// Fallback for platforms that do not define PATH_MAX
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
// presumably the hex form of a SHA-1 digest (40 characters) plus terminator - TODO confirm
#define SHA1_STR_LENGTH 41
// Size in bytes of a raw SHA-1 digest
#define SHA1_DIGEST_LENGTH 20
// Append a copy of the C string `value` to doc's metadata list under `keyname`.
// `value` is evaluated twice. The allocation is sizeof(meta_line_t) + strlen(value),
// so the terminator presumably fits in storage str_val already provides - TODO confirm.
#define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \
strcpy(meta_str->str_val, value); \
APPEND_META(doc, meta_str)}
// Append an integer metadata line (stored in long_val) under `keyname`
#define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_long->key = keyname; \
meta_long->long_val = value; \
APPEND_META(doc, meta_long)}
// Append a MetaThumbnail line formatted as "WWWW,HHHH".
// NOTE(review): room is reserved for 4 digits per dimension only; a value >= 10000 would overflow
#define APPEND_TN_META(doc, width, height) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \
meta_str->key = MetaThumbnail; \
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
APPEND_META(doc, meta_str)}
// Link `meta` at the tail of doc's singly linked metadata list.
// Expands to several statements and is written without a trailing semicolon at call sites.
#define APPEND_META(doc, meta) \
meta->next = NULL;\
if (doc->meta_head == NULL) {\
doc->meta_head = meta;\
doc->meta_tail = doc->meta_head;\
} else {\
doc->meta_tail->next = meta;\
doc->meta_tail = meta;\
}
// Pass `str` through a text buffer (text_buffer_append_string0) and append the result under `keyname`.
// Declares `tex` and `meta_tag` in the enclosing scope, so use it at most once per block.
#define APPEND_UTF8_META(doc, keyname, str) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, str); \
text_buffer_terminate_string(&tex); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
APPEND_META(doc, meta_tag) \
text_buffer_destroy(&tex);

View File

@@ -0,0 +1,809 @@
#include "media.h"
#include "../ocr/ocr.h"
#include <ctype.h>
// Frames or streams with a side of MIN_SIZE pixels or less get no thumbnail
#define MIN_SIZE 32
// Size of the buffer handed to avio_alloc_context()
#define AVIO_BUF_SIZE 8192
// True unless the demuxer is named "image2" (a NULL name also yields false)
#define IS_VIDEO(fmt) ((fmt)->iformat->name && strcmp((fmt)->iformat->name, "image2") != 0)
// Expects a variable named `stream` in scope; streams with at most one frame are treated as still images
#define STREAM_IS_IMAGE (stream->nb_frames <= 1)
// Sentinel returned by scale_frame(): store the original packet instead of re-encoding
#define STORE_AS_IS ((void*)-1)
// Pointer to document being processed
__thread document_t *thread_doc;
/**
 * Pick a file name carrying an extension. When the document has none
 * (doc->ext does not lie past doc->base) and the mime type is PNG or JPEG, a
 * placeholder name with the matching extension is returned. In every other
 * case filepath is returned unchanged.
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
    if (doc->ext > doc->base) {
        return filepath;
    }

    if (strcmp(mime_str, "image/png") == 0) {
        return "file.png";
    }
    if (strcmp(mime_str, "image/jpeg") == 0) {
        return "file.jpg";
    }

    return filepath;
}
/**
 * Produce a thumbnail-sized copy of frame.
 *
 * @param decoder decoder that produced frame (source size and pixel format are read from it)
 * @param frame   decoded frame
 * @param size    maximum width/height of the thumbnail
 * @return NULL when the frame has no picture type or the target is MIN_SIZE or smaller in
 *         either dimension; STORE_AS_IS when the frame already fits and the codec is MJPEG
 *         or PNG; otherwise a newly allocated AVFrame whose data buffer was obtained with
 *         av_malloc() and must be released by the caller
 */
__always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
return NULL;
}
int dstW;
int dstH;
if (frame->width <= size && frame->height <= size) {
// Already small enough: JPEG/PNG sources can be stored untouched
if (decoder->codec_id == AV_CODEC_ID_MJPEG || decoder->codec_id == AV_CODEC_ID_PNG) {
return STORE_AS_IS;
}
dstW = frame->width;
dstH = frame->height;
} else {
// Shrink so that the longest side equals size, keeping the aspect ratio
double ratio = (double) frame->width / frame->height;
if (frame->width > frame->height) {
dstW = size;
dstH = (int) (size / ratio);
} else {
dstW = (int) (size * ratio);
dstH = size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return NULL;
}
AVFrame *scaled_frame = av_frame_alloc();
// NOTE(review): the scaler targets YUVJ420P while the buffer and frame format below use YUV420P;
// the result of sws_getContext() is not checked
struct SwsContext *sws_ctx = sws_getContext(
decoder->width, decoder->height, decoder->pix_fmt,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
// Allocated at twice the computed size
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, decoder->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
return scaled_frame;
}
// A decoded frame together with the packet it was decoded from, as returned by read_frame()
typedef struct {
// Compressed packet; its data is stored directly when the thumbnail is kept as-is
AVPacket *packet;
// Decoded picture
AVFrame *frame;
} frame_and_packet_t;
/**
 * Release a frame_and_packet_t together with the packet and frame it owns.
 *
 * av_packet_free() and av_frame_free() accept a pointer to a NULL pointer and
 * reset the pointer after freeing, so no extra free() of the packet is needed
 * (it would always have been free(NULL)).
 */
static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) {
    av_packet_free(&frame_and_packet->packet);
    av_frame_free(&frame_and_packet->frame);
    free(frame_and_packet);
}
/**
 * Decode every packet of subtitle stream stream_idx and store the collected text as a
 * MetaContent line on doc. Reads pFormatCtx to its end; the caller is responsible for
 * seeking back afterwards.
 */
__always_inline
static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) {
text_buffer_t tex = text_buffer_create(-1);
AVPacket packet;
AVSubtitle subtitle;
// NOTE(review): a missing decoder and failures of the setup calls below are not checked
AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
avcodec_open2(decoder, subtitle_codec, NULL);
decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;
// NOTE(review): not initialised; relies on avcodec_decode_subtitle2() always writing it
int got_sub;
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, &packet);
if (read_frame_ret != 0) {
break;
}
// Skip packets that belong to other streams
if (packet.stream_index != stream_idx) {
av_packet_unref(&packet);
continue;
}
avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet);
if (got_sub) {
for (int i = 0; i < subtitle.num_rects; i++) {
const char *text = subtitle.rects[i]->ass;
if (text == NULL) {
continue;
}
// Only the part after the first "\N" marker is kept, and only if more than one character follows it
char *idx = strstr(text, "\\N");
if (idx != NULL && strlen(idx + 2) > 1) {
text_buffer_append_string0(&tex, idx + 2);
text_buffer_append_char(&tex, ' ');
}
}
avsubtitle_free(&subtitle);
}
av_packet_unref(&packet);
}
text_buffer_terminate_string(&tex);
APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
text_buffer_destroy(&tex);
avcodec_free_context(&decoder);
}
/**
 * Read packets of stream stream_idx and feed them to decoder until it stops asking for
 * more input.
 *
 * @return a newly allocated frame_and_packet_t (release with frame_and_packet_free()),
 *         or NULL when reading or sending a packet fails, including at end of file
 */
__always_inline
static frame_and_packet_t *
read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx,
document_t *doc) {
frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t));
result->packet = av_packet_alloc();
result->frame = av_frame_alloc();
av_init_packet(result->packet);
// Loop for as long as the decoder answers EAGAIN (it needs another packet)
int receive_ret = -EAGAIN;
while (receive_ret == -EAGAIN) {
// Get video frame
while (1) {
int read_frame_ret = av_read_frame(pFormatCtx, result->packet);
if (read_frame_ret != 0) {
// End of file is expected and therefore not logged
if (read_frame_ret != AVERROR_EOF) {
CTX_LOG_WARNINGF(doc->filepath,
"(media.c) avcodec_read_frame() returned error code [%d] %s",
read_frame_ret, av_err2str(read_frame_ret)
)
}
frame_and_packet_free(result);
return NULL;
}
//Ignore audio/other frames
if (result->packet->stream_index != stream_idx) {
av_packet_unref(result->packet);
continue;
}
break;
}
// Feed it to decoder
int decode_ret = avcodec_send_packet(decoder, result->packet);
if (decode_ret != 0) {
CTX_LOG_ERRORF(doc->filepath,
"(media.c) avcodec_send_packet() returned error code [%d] %s",
decode_ret, av_err2str(decode_ret)
)
frame_and_packet_free(result);
return NULL;
}
// NOTE(review): any result other than EAGAIN ends the loop, so a decode error still returns `result`
receive_ret = avcodec_receive_frame(decoder, result->frame);
if (receive_ret == -EAGAIN && result->packet != NULL) {
av_packet_unref(result->packet);
}
}
return result;
}
/**
 * Append tag->value under key unless doc already carries a metadata line
 * with that key, in which case the new value is dropped and a debug message
 * is logged.
 */
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {
    for (meta_line_t *existing = doc->meta_head; existing != NULL; existing = existing->next) {
        if (existing->key != key) {
            continue;
        }
        CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'",
                       key, existing->str_val, key, tag->value)
        return;
    }

    // Same sequence as the hand-written version: text buffer -> meta line -> list append
    APPEND_UTF8_META(doc, key, tag->value)
}
// Append tag->value as a UTF-8 metadata line under keyname (expects `doc` and `tag` in scope)
#define APPEND_TAG_META(keyname) \
    APPEND_UTF8_META(doc, keyname, tag->value)

// Copy str into the char ARRAY dst (sizeof(dst) must be the array size) and lower-case it in place.
// The copy is truncated to fit and always NUL-terminated. Bytes are passed to tolower() as
// unsigned char, as <ctype.h> requires. Declares `ptr` in the enclosing scope.
#define STRCPY_TOLOWER(dst, str) \
    strncpy(dst, str, sizeof(dst) - 1); \
    dst[sizeof(dst) - 1] = '\0'; \
    char *ptr = dst; \
    for (; *ptr; ++ptr) *ptr = (char) tolower((unsigned char) *ptr);
/**
 * Copy the recognised container-level tags (artist, genre, title,
 * album_artist, album, comment) into doc's metadata. Tag names are matched
 * case-insensitively. Unknown tags are ignored.
 */
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
    static const struct {
        const char *name;
        enum metakey key;
    } known_tags[] = {
            {"artist",       MetaArtist},
            {"genre",        MetaGenre},
            {"title",        MetaTitle},
            {"album_artist", MetaAlbumArtist},
            {"album",        MetaAlbum},
            {"comment",      MetaContent},
    };
    const size_t known_count = sizeof(known_tags) / sizeof(known_tags[0]);

    AVDictionaryEntry *tag = NULL;
    while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
        char key[256];
        STRCPY_TOLOWER(key, tag->key)

        for (size_t i = 0; i < known_count; i++) {
            if (strcmp(key, known_tags[i].name) == 0) {
                APPEND_TAG_META(known_tags[i].key)
                break;
            }
        }
    }
}
/**
 * Attach metadata for a video or still image to doc.
 *
 * For videos (is_video non-zero): duration, bitrate and the container tags
 * title/comment/artist. For images: the tags found in frame->metadata (EXIF).
 * Tags added through append_tag_meta_if_not_exists() are skipped when doc already
 * has a line with the same key; tags added through APPEND_TAG_META are always added.
 */
__always_inline
static void
append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int is_video) {
if (is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
// Convert from AV_TIME_BASE units to seconds
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
// Durations that do not fit in 32 bits are recorded as 0
if (meta_duration->long_val > INT32_MAX) {
meta_duration->long_val = 0;
}
APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->long_val = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
}
AVDictionaryEntry *tag = NULL;
if (is_video) {
// Container-level tags, matched case-insensitively
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "title") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaTitle);
} else if (strcmp(key, "comment") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
}
}
} else {
// EXIF metadata
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(key, "imagedescription") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake)
} else if (strcmp(key, "model") == 0) {
APPEND_TAG_META(MetaExifModel)
} else if (strcmp(key, "software") == 0) {
APPEND_TAG_META(MetaExifSoftware)
} else if (strcmp(key, "fnumber") == 0) {
APPEND_TAG_META(MetaExifFNumber)
} else if (strcmp(key, "focallength") == 0) {
APPEND_TAG_META(MetaExifFocalLength)
} else if (strcmp(key, "usercomment") == 0) {
APPEND_TAG_META(MetaExifUserComment)
} else if (strcmp(key, "isospeedratings") == 0) {
APPEND_TAG_META(MetaExifIsoSpeedRatings)
} else if (strcmp(key, "exposuretime") == 0) {
APPEND_TAG_META(MetaExifExposureTime)
} else if (strcmp(key, "datetime") == 0) {
APPEND_TAG_META(MetaExifDateTime)
} else if (strcmp(key, "gpslatitude") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeDMS)
} else if (strcmp(key, "gpslatituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeRef)
} else if (strcmp(key, "gpslongitude") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeDMS)
} else if (strcmp(key, "gpslongituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeRef)
}
}
}
}
// OCR result callback: stores the recognised text as a MetaContent line on the
// document currently referenced by thread_doc. The length argument is not needed.
static void ocr_image_cb(const char *text, size_t len) {
    (void) len;
    APPEND_STR_META(thread_doc, MetaContent, text);
}
// Pixel layout handed to the OCR engine, with its bytes-per-pixel count
#define OCR_PIXEL_FORMAT AV_PIX_FMT_RGB32
#define OCR_BYTES_PER_PIXEL 4
// Resolution value reported to ocr_extract_text() for every image
#define OCR_PIXELS_PER_INCH 70
/**
 * Run OCR on a decoded image: the frame is converted to OCR_PIXEL_FORMAT at its original
 * size and passed to ocr_extract_text(). Results reach doc through ocr_image_cb, which
 * reads the thread_doc pointer set here.
 */
void ocr_image(scan_media_ctx_t *ctx, document_t *doc, const AVCodecContext *decoder, AVFrame *frame) {
// Convert to RGB32
AVFrame *rgb_frame = av_frame_alloc();
// NOTE(review): the result of sws_getContext() is not checked
struct SwsContext *sws_ctx = sws_getContext(
frame->width, frame->height, decoder->pix_fmt,
frame->width, frame->height, OCR_PIXEL_FORMAT,
SWS_LANCZOS, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(OCR_PIXEL_FORMAT, frame->width, frame->height, 1);
// Allocated at twice the computed size
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(rgb_frame->data, rgb_frame->linesize, dst_buf, OCR_PIXEL_FORMAT, frame->width, frame->height,
1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, frame->height,
rgb_frame->data, rgb_frame->linesize
);
// Make the document reachable from the OCR callback
thread_doc = doc;
ocr_extract_text(
ctx->tesseract_path,
ctx->tesseract_lang,
rgb_frame->data[0],
frame->width,
frame->height,
OCR_BYTES_PER_PIXEL,
rgb_frame->linesize[0],
OCR_PIXELS_PER_INCH,
ocr_image_cb
);
sws_freeContext(sws_ctx);
// The pixel buffer came from av_malloc() above, so it is released separately from the frame
av_free(*rgb_frame->data);
av_frame_free(&rgb_frame);
}
/**
 * Extract metadata and a thumbnail from an opened media container.
 *
 * Records codec names and video dimensions, optionally reads subtitles, appends audio
 * tags, and - when a video stream exists and ctx->tn_size > 0 - decodes one frame to
 * OCR it (still images only), append video/EXIF metadata and store a thumbnail.
 * pFormatCtx is closed on every return path.
 */
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
int subtitle_stream = -1;
avformat_find_stream_info(pFormatCtx, NULL);
// Streams are walked from last to first: for audio and video the first match (highest index)
// is kept, for subtitles the last assignment (lowest index) wins
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
AVStream *stream = pFormatCtx->streams[i];
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
if (audio_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
}
audio_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
if (video_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaVideoCodec, desc->name)
}
meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth;
meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight;
meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h)
video_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
subtitle_stream = i;
}
}
if (subtitle_stream != -1 && ctx->read_subtitles) {
read_subtitles(ctx, pFormatCtx, subtitle_stream, doc);
// Reset stream
if (video_stream != -1) {
av_seek_frame(pFormatCtx, video_stream, 0, 0);
}
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream];
// Too small to be worth a thumbnail
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
//Seek
// For real videos (not single images, not GIFs) jump to 10% of the stream duration.
// NOTE(review): the retry loop repeats the identical seek up to 21 times
if (!STREAM_IS_IMAGE && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
(long) ((double) stream->duration * 0.10), 0);
if (seek_ret == 0) {
break;
}
}
}
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, video_stream, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (ctx->tesseract_lang != NULL && STREAM_IS_IMAGE) {
ocr_image(ctx, doc, decoder, frame_and_packet->frame);
}
// NOTE: OCR'd content takes precedence over exif image description
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (scaled_frame == STORE_AS_IS) {
// Small JPEG/PNG: store the original compressed packet as the thumbnail
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
avcodec_free_context(&jpeg_encoder);
av_packet_unref(&jpeg_packet);
// The pixel buffer was allocated separately by scale_frame()
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
}
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
}
/**
 * Open the media file at filepath with libavformat and parse it. The format
 * context is handed over to parse_media_format_ctx(), which closes it.
 * Failures are logged against doc->filepath.
 */
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {
    AVFormatContext *format_ctx = avformat_alloc_context();
    if (format_ctx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return;
    }

    const int open_ret = avformat_open_input(&format_ctx, filepath, NULL, NULL);
    if (open_ret >= 0) {
        parse_media_format_ctx(ctx, format_ctx, doc);
        return;
    }

    CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", open_ret, av_err2str(open_ret))
    avformat_close_input(&format_ctx);
    avformat_free_context(format_ctx);
}
// AVIO read callback backed by a vfile: forwards to the file's read function
// and maps a zero-byte read to AVERROR_EOF
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
    struct vfile *vf = ptr;
    const int count = vf->read(vf, buf, buf_size);
    return count == 0 ? AVERROR_EOF : count;
}
// A media file held entirely in memory and exposed through a stdio stream
typedef struct {
// Number of bytes in buf
size_t size;
// Stream opened over buf with fmemopen()
FILE *file;
// Backing buffer; released by memfile_close()
void *buf;
} memfile_t;
/**
 * AVIO read callback backed by a memfile_t.
 *
 * @param ptr      opaque pointer registered with avio_alloc_context(); a memfile_t *
 * @param buf      destination buffer
 * @param buf_size capacity of `buf` in bytes
 * @return number of bytes read, or AVERROR_EOF when nothing was read and
 *         the stream is at end-of-file
 */
int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
    memfile_t *mf = ptr;
    const size_t n_read = fread(buf, 1, buf_size, mf->file);

    if (n_read != 0 || !feof(mf->file)) {
        return (int) n_read;
    }
    return AVERROR_EOF;
}
/**
 * AVIO seek callback backed by a memfile_t.
 *
 * @param ptr    opaque pointer registered with avio_alloc_context(); a memfile_t *
 * @param offset offset forwarded to fseek()
 * @param whence fseek() origin, or 0x10000 to query the total size
 * @return the total size for a size query, the new stream position after a
 *         successful seek, or AVERROR_EOF when fseek() fails
 */
long memfile_seek(void *ptr, long offset, int whence) {
    memfile_t *mf = ptr;

    // 0x10000 matches FFmpeg's AVSEEK_SIZE: report the size, do not move.
    if (whence == 0x10000) {
        return (long) mf->size;
    }

    if (fseek(mf->file, offset, whence) != 0) {
        return AVERROR_EOF;
    }
    return ftell(mf->file);
}
/**
 * Load the whole content of a vfile in memory and expose it as a stream.
 *
 * Allocates f->info.st_size bytes, fills them through the vfile's read
 * callback and opens a read-only stream over the buffer. When the vfile
 * requests a checksum, the SHA1 of the buffer is computed and stored in it.
 *
 * On success the caller releases the resources with memfile_close().
 * On failure everything acquired here is released and `mem->buf` /
 * `mem->file` are reset to NULL, so calling memfile_close() afterwards is
 * harmless.
 *
 * @return 0 on success, -1 on failure
 */
int memfile_open(vfile_t *f, memfile_t *mem) {
    mem->size = f->info.st_size;
    mem->file = NULL;

    mem->buf = malloc(mem->size);
    if (mem->buf == NULL) {
        return -1;
    }

    int ret = f->read(f, mem->buf, mem->size);

    mem->file = fmemopen(mem->buf, mem->size, "rb");

    if (f->calculate_checksum) {
        SHA1_Init(&f->sha1_ctx);
        safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
        SHA1_Final(f->sha1_digest, &f->sha1_ctx);
        f->has_checksum = TRUE;
    }

    if (ret >= 0 && (size_t) ret == mem->size && mem->file != NULL) {
        return 0;
    }

    // Short read or fmemopen() failure: do not leave a half-initialised
    // memfile behind (a NULL stream would later be passed to fclose()).
    if (mem->file != NULL) {
        fclose(mem->file);
        mem->file = NULL;
    }
    free(mem->buf);
    mem->buf = NULL;

    return -1;
}
/**
 * Expose a caller-owned buffer as a read-only stream.
 *
 * The buffer is only borrowed: memfile_close() frees `mem->buf`, so a
 * memfile opened with this function must be released with
 * fclose(mem->file) instead.
 *
 * @param buf     start of the data
 * @param buf_len length of the data in bytes
 * @param mem     memfile to initialise
 * @return 0 on success, -1 when fmemopen() fails
 */
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
    // Keep the full size_t length; narrowing it to int corrupts the size of
    // buffers larger than INT_MAX.
    mem->size = buf_len;
    mem->buf = buf;

    mem->file = fmemopen(mem->buf, mem->size, "rb");

    return mem->file != NULL ? 0 : -1;
}
/**
 * Release a memfile opened with memfile_open().
 *
 * Closes the stream (when there is one) and frees the backing buffer. Both
 * members are reset to NULL, so the call is safe on a zero-initialised or
 * already closed memfile.
 */
void memfile_close(memfile_t *mem) {
    // The stream may be NULL even though the buffer was allocated
    // (fmemopen() failure); fclose(NULL) is undefined behaviour.
    if (mem->file != NULL) {
        fclose(mem->file);
        mem->file = NULL;
    }

    free(mem->buf);
    mem->buf = NULL;
}
/**
 * Parse a media file that is only reachable through a vfile.
 *
 * Files no larger than ctx->max_media_buffer are loaded in memory so that
 * libavformat can seek in them; otherwise (or when loading fails) the file
 * is streamed through vfile_read() without seek support.
 *
 * @param mime_str forwarded to get_filepath_with_ext() to build the name
 *                 given to avformat_open_input()
 */
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {
    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return;
    }

    unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate I/O buffer with av_malloc()")
        avformat_free_context(pFormatCtx);
        return;
    }

    AVIOContext *io_ctx = NULL;
    memfile_t memfile = {0, 0, 0};

    const char *filepath = get_filepath_with_ext(doc, f->filepath, mime_str);

    if (f->info.st_size <= ctx->max_media_buffer) {
        int ret = memfile_open(f, &memfile);
        if (ret == 0) {
            CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
            io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
        }
    }

    if (io_ctx == NULL) {
        CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
        io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
    }

    if (io_ctx == NULL) {
        // Without an I/O context there is nothing to read from; bail out
        // instead of dereferencing NULL below.
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate I/O context with avio_alloc_context()")
        av_free(buffer);
        memfile_close(&memfile);
        avformat_free_context(pFormatCtx);
        return;
    }

    pFormatCtx->pb = io_ctx;

    int res = avformat_open_input(&pFormatCtx, filepath, NULL, NULL);
    if (res < 0) {
        // -5 is not reported; presumably AVERROR(EIO) - TODO confirm
        if (res != -5) {
            CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", res, av_err2str(res))
        }
        // libavformat may have replaced the buffer, so free the current one
        av_free(io_ctx->buffer);
        memfile_close(&memfile);
        avio_context_free(&io_ctx);
        avformat_close_input(&pFormatCtx);
        avformat_free_context(pFormatCtx);
        return;
    }

    parse_media_format_ctx(ctx, pFormatCtx, doc);

    av_free(io_ctx->buffer);
    avio_context_free(&io_ctx);
    memfile_close(&memfile);
}
/**
 * Entry point of the media scanner.
 *
 * Files flagged `is_fs_file` are opened by path; every other vfile goes
 * through the custom I/O path in parse_media_vfile().
 */
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
    if (!f->is_fs_file) {
        parse_media_vfile(ctx, f, doc, mime_str);
        return;
    }

    parse_media_filename(ctx, f->filepath, doc);
}
// One-time setup for the media scanner: sets the libav log level to AV_LOG_QUIET.
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
/**
 * Decode an in-memory image and store a thumbnail for `doc`.
 *
 * The image is decoded with libavformat/libavcodec, scaled with
 * scale_frame() and, unless scale_frame() asks to keep the original packet
 * (STORE_AS_IS), re-encoded as JPEG before being handed to ctx->store().
 *
 * @param buf     image data; only borrowed, never freed here
 * @param buf_len length of `buf` in bytes
 * @param url     name given to avformat_open_input()
 * @return TRUE when a thumbnail was stored, FALSE otherwise
 */
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
    int stored = FALSE;
    memfile_t memfile = {0, 0, 0};
    unsigned char *buffer = NULL;
    AVIOContext *io_ctx = NULL;
    AVCodecContext *decoder = NULL;
    frame_and_packet_t *frame_and_packet = NULL;
    AVFrame *scaled_frame = NULL;
    AVStream *stream = NULL;
    const AVCodec *video_codec = NULL;

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return FALSE;
    }

    if (memfile_open_buf(buf, buf_len, &memfile) != 0) {
        goto cleanup;
    }
    CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%zuB)", buf_len)

    buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        goto cleanup;
    }

    io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
    if (io_ctx == NULL) {
        av_free(buffer);
        goto cleanup;
    }

    pFormatCtx->pb = io_ctx;

    // On failure avformat_open_input() frees the context and resets the
    // pointer to NULL; the cleanup calls below are then no-ops.
    if (avformat_open_input(&pFormatCtx, url, NULL, NULL) != 0) {
        goto cleanup;
    }

    if (pFormatCtx->nb_streams == 0) {
        goto cleanup;
    }
    stream = pFormatCtx->streams[0];

    // Decoder
    video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    if (video_codec == NULL) {
        goto cleanup;
    }

    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }

    if (avcodec_parameters_to_context(decoder, stream->codecpar) < 0
        || avcodec_open2(decoder, video_codec, NULL) < 0) {
        goto cleanup;
    }

    frame_and_packet = read_frame(ctx, pFormatCtx, decoder, 0, doc);
    if (frame_and_packet == NULL) {
        goto cleanup;
    }

    // Scale frame
    scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    if (scaled_frame == STORE_AS_IS) {
        APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
                   frame_and_packet->packet->size);
        stored = TRUE;
    } else {
        // Encode frame to jpeg
        AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
                                                          ctx->tn_qscale);
        if (jpeg_encoder != NULL) {
            AVPacket jpeg_packet;
            av_init_packet(&jpeg_packet);
            jpeg_packet.data = NULL;
            jpeg_packet.size = 0;

            // Only store the packet when the encoder actually produced one
            if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0
                && avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
                // Save thumbnail
                APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
                ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data,
                           jpeg_packet.size);
                stored = TRUE;
            }

            av_packet_unref(&jpeg_packet);
            avcodec_free_context(&jpeg_encoder);
        }

        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }

cleanup:
    if (frame_and_packet != NULL) {
        frame_and_packet_free(frame_and_packet);
    }
    avcodec_free_context(&decoder);
    avformat_close_input(&pFormatCtx);
    avformat_free_context(pFormatCtx);
    if (io_ctx != NULL) {
        // libavformat may have replaced the buffer, so free the current one
        av_free(io_ctx->buffer);
        avio_context_free(&io_ctx);
    }
    // `buf` belongs to the caller: close the stream only
    if (memfile.file != NULL) {
        fclose(memfile.file);
    }

    return stored;
}

View File

@@ -0,0 +1,55 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "../scan.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavcodec/avcodec.h"
#include "libavutil/imgutils.h"
// Configuration and callbacks for the media scanner.
// NOTE(review): raw.c casts a scan_raw_ctx_t * to this type before calling
// store_image_thumbnail(), so the two structs presumably have to stay
// layout-compatible - confirm before reordering fields.
typedef struct {
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Called with the document's path_md5 and the thumbnail bytes
store_callback_t store;
// Thumbnail size forwarded to scale_frame()
int tn_size;
// Quality value forwarded to alloc_jpeg_encoder()
float tn_qscale;
// Files no larger than this are loaded in memory by parse_media_vfile()
long max_media_buffer;
// Presumably enables subtitle extraction - TODO confirm against media.c
int read_subtitles;
// Presumably forwarded to the OCR helper - TODO confirm against media.c
const char *tesseract_lang;
const char *tesseract_path;
} scan_media_ctx_t;
/**
 * Allocate and open an MJPEG encoder for a w x h YUVJ420P frame.
 *
 * @param w      frame width in pixels
 * @param h      frame height in pixels
 * @param qscale value assigned to the encoder's i_quant_factor
 * @return an opened encoder context that the caller releases with
 *         avcodec_free_context(), or NULL on failure
 */
__always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {
    const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
    if (jpeg_codec == NULL) {
        return NULL;
    }

    AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
    if (jpeg == NULL) {
        return NULL;
    }

    jpeg->width = w;
    jpeg->height = h;
    jpeg->time_base.den = 1000000;
    jpeg->time_base.num = 1;
    jpeg->i_quant_factor = qscale;
    jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;

    int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
    if (ret != 0) {
        // Do not leak the context when the encoder cannot be opened
        avcodec_free_context(&jpeg);
        return NULL;
    }

    return jpeg;
}
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
void init_media();
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif

View File

@@ -0,0 +1,79 @@
#include "scan_mobi.h"
#include <mobi.h>
#include <errno.h>
#include "stdlib.h"
/**
 * Extract author, title and text content from a MOBI e-book.
 *
 * The whole file is read in memory, loaded with libmobi, and the raw
 * markup is appended to `doc` as MetaContent after going through
 * text_buffer_append_markup(). Failures are logged and leave the metadata
 * collected so far in place.
 */
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {
    size_t buf_len = 0;
    char *buf = NULL;
    char *content_str = NULL;
    FILE *file = NULL;
    MOBI_RET mobi_ret;
    size_t maxlen;
    size_t length;

    MOBIData *m = mobi_init();
    if (m == NULL) {
        CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
        return;
    }

    buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        goto cleanup;
    }

    file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        // Log before any cleanup call gets a chance to overwrite errno
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        goto cleanup;
    }

    mobi_ret = mobi_load_file(m, file);
    fclose(file);
    if (mobi_ret != MOBI_SUCCESS) {
        CTX_LOG_ERRORF(f->filepath, "mobi_load_file() returned error code [%d]", mobi_ret)
        goto cleanup;
    }

    char *author = mobi_meta_get_author(m);
    if (author != NULL) {
        APPEND_STR_META(doc, MetaAuthor, author)
        free(author);
    }

    char *title = mobi_meta_get_title(m);
    if (title != NULL) {
        APPEND_STR_META(doc, MetaTitle, title)
        free(title);
    }

    maxlen = mobi_get_text_maxsize(m);
    if (maxlen == MOBI_NOTSET) {
        CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
        goto cleanup;
    }

    content_str = malloc(maxlen + 1);
    if (content_str == NULL) {
        CTX_LOG_ERROR(f->filepath, "malloc() failed")
        goto cleanup;
    }

    length = maxlen;
    mobi_ret = mobi_get_rawml(m, content_str, &length);
    if (mobi_ret != MOBI_SUCCESS) {
        CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
    } else {
        text_buffer_t tex = text_buffer_create(ctx->content_size);
        text_buffer_append_markup(&tex, content_str);
        text_buffer_terminate_string(&tex);

        APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)

        text_buffer_destroy(&tex);
    }

cleanup:
    // Single exit point: every path releases the same three resources
    free(content_str);
    free(buf);
    mobi_free(m);
}

View File

@@ -0,0 +1,14 @@
#ifndef SCAN_SCAN_MOBI_H
#define SCAN_SCAN_MOBI_H
#include "../scan.h"
// Configuration and callbacks for the MOBI scanner.
typedef struct {
// Passed to text_buffer_create() when extracting the book content
long content_size;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
} scan_mobi_ctx_t;
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@@ -0,0 +1,147 @@
#include "msdoc.h"
#include <errno.h>
#include <sys/mman.h>
#include "../../third-party/antiword/src/antiword.h"
#include "../ebook/ebook.h"
/**
 * Extract author, title and text content from a legacy Word document.
 *
 * The document is converted to UTF-8 text with antiword into a memory
 * stream, and the result is appended to `doc` as MetaContent.
 *
 * @param file_in stream over the document; it is NOT closed here, the
 *                caller keeps ownership
 * @param buf     buffer backing `file_in`; ownership is taken and it is
 *                freed on every path
 * @param buf_len length of `buf` in bytes
 */
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_text;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_utf_8;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file_in);

    size_t out_len = 0;
    char *out_buf = NULL;

    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERRORF(doc->filepath, "open_memstream() failed (%d)", errno)
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // file_in belongs to the caller and must not be closed here;
        // release only what this function acquired.
        fclose(file_out);
        free(out_buf);
        free(buf);
        return;
    }

    iInitDocument(file_in, (int) buf_len);
    const char *author = szGetAuthor();
    if (author != NULL) {
        APPEND_UTF8_META(doc, MetaAuthor, author)
    }

    const char *title = szGetTitle();
    if (title != NULL) {
        APPEND_UTF8_META(doc, MetaTitle, title)
    }
    vFreeDocument();

    bWordDecryptor(file_in, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the memory stream finalises out_buf / out_len
    fclose(file_out);

    if (buf_len > 0) {
        text_buffer_t tex = text_buffer_create(ctx->content_size);
        text_buffer_append_string(&tex, out_buf, out_len);
        text_buffer_terminate_string(&tex);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        if (meta_content != NULL) {
            meta_content->key = MetaContent;
            memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
            APPEND_META(doc, meta_content)
        }

        text_buffer_destroy(&tex);
    }

    free(buf);
    free(out_buf);
}
/**
 * Convert a legacy Word document to PDF with antiword and feed the result
 * to the e-book parser (parse_ebook_mem()).
 *
 * @param file    stream over the document; not closed here
 * @param buf     buffer handed over by the caller; ownership is taken and
 *                it is freed on every path
 * @param buf_len length of the document in bytes
 */
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {

    scan_ebook_ctx_t ebook_ctx = {
            .content_size = ctx->content_size,
            .tn_size = ctx->tn_size,
            .log = ctx->log,
            .logf = ctx->logf,
            .store = ctx->store,
    };

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_pdf;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_latin_1;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file);

    size_t out_len = 0;
    char *out_buf = NULL;

    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERRORF(doc->filepath, "open_memstream() failed (%d)", errno)
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release everything acquired so far instead of leaking it
        fclose(file_out);
        free(out_buf);
        free(buf);
        return;
    }

    bWordDecryptor(file, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the memory stream finalises out_buf / out_len
    fclose(file_out);

    parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);

    free(buf);
    free(out_buf);
}
/**
 * Entry point of the legacy Word (.doc) scanner.
 *
 * Reads the whole file in memory and opens a stream over it. When
 * thumbnails are enabled (ctx->tn_size > 0) the document first goes through
 * the PDF conversion path, then the text path always runs. Both callees
 * free the buffer they receive.
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        // Log before free() gets a chance to overwrite errno
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        free(buf);
        return;
    }

    if (ctx->tn_size > 0) {
        // parse_msdoc_pdf() frees the buffer it is given, so hand it a copy
        char *buf_pdf = malloc(buf_len);
        if (buf_pdf != NULL) {
            memcpy(buf_pdf, buf, buf_len);
            parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
        } else {
            CTX_LOG_ERROR(f->filepath, "malloc() failed")
        }
    }

    parse_msdoc_text(ctx, doc, file, buf, buf_len);
    fclose(file);
}

View File

@@ -0,0 +1,24 @@
#ifndef SCAN_SCAN_MSDOC_H
#define SCAN_SCAN_MSDOC_H
#include "../scan.h"
// Configuration and callbacks for the legacy Word (.doc) scanner.
typedef struct {
// Passed to text_buffer_create() when extracting text
long content_size;
// Thumbnail size; the PDF conversion path only runs when it is > 0
int tn_size;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Forwarded to the e-book scanner context in parse_msdoc_pdf()
store_callback_t store;
// Mime id compared against by is_msdoc()
unsigned int msdoc_mime;
} scan_msdoc_ctx_t;
/**
 * Tell whether `mime` is the mime id configured in ctx->msdoc_mime.
 *
 * @return non-zero when the ids match, 0 otherwise
 */
__always_inline
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->msdoc_mime;

    return expected == mime;
}
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

47
third-party/libscan/libscan/ocr/ocr.h vendored Normal file
View File

@@ -0,0 +1,47 @@
#ifndef OCR_H
#define OCR_H
#include "../scan.h"
#include <tesseract/capi.h>
// Images narrower or shorter than these bounds are not sent to the OCR engine
#define MIN_OCR_WIDTH 350
#define MIN_OCR_HEIGHT 100
// Recognized text shorter than this (in bytes) is not passed to the callback
#define MIN_OCR_LEN 10
// Bit depths accepted by ocr_extract_text()
#define OCR_IS_VALID_BPP(d) \
((d) == 1 || (d) == 2 || (d) == 4 || (d) == 8 || (d) == 16 || (d) == 24 || \
(d) == 32)
// Receives the recognized text and its length in bytes
typedef void (*ocr_extract_callback_t)(const char *, size_t);
/**
 * Run tesseract on a raw image and pass the recognized text to `cb`.
 *
 * Nothing happens when the image is smaller than MIN_OCR_WIDTH x
 * MIN_OCR_HEIGHT, when `img_xres` is not positive, when the bit depth is
 * not accepted by OCR_IS_VALID_BPP, or when the engine cannot be
 * initialised. The callback is only invoked for text of at least
 * MIN_OCR_LEN bytes; the text is released right after the callback returns.
 *
 * @param tesseract_path data path given to TessBaseAPIInit3()
 * @param tesseract_lang language given to TessBaseAPIInit3()
 * @param img_buf        pixel data
 * @param img_w          image width
 * @param img_h          image height
 * @param img_bpp        bit depth, validated with OCR_IS_VALID_BPP
 * @param img_stride     forwarded to TessBaseAPISetImage()
 * @param img_xres       forwarded to TessBaseAPISetSourceResolution()
 * @param cb             receives the recognized text and its length
 */
__always_inline static void
ocr_extract_text(const char *tesseract_path, const char *tesseract_lang,
                 const unsigned char *img_buf, const int img_w, const int img_h,
                 const int img_bpp, const int img_stride, const int img_xres,
                 const ocr_extract_callback_t cb) {

    if (img_w < MIN_OCR_WIDTH || img_h < MIN_OCR_HEIGHT || img_xres <= 0 ||
        !OCR_IS_VALID_BPP(img_bpp)) {
        return;
    }

    TessBaseAPI *api = TessBaseAPICreate();
    if (api == NULL) {
        return;
    }

    // A non-zero result means the engine is unusable (e.g. missing language
    // data); do not feed it an image.
    if (TessBaseAPIInit3(api, tesseract_path, tesseract_lang) != 0) {
        TessBaseAPIDelete(api);
        return;
    }

    TessBaseAPISetImage(api, img_buf, img_w, img_h, img_bpp, img_stride);
    TessBaseAPISetSourceResolution(api, img_xres);

    char *text = TessBaseAPIGetUTF8Text(api);
    if (text != NULL) {
        size_t len = strlen(text);
        if (len >= MIN_OCR_LEN) {
            cb(text, len);
        }
        TessDeleteText(text);
    }

    TessBaseAPIEnd(api);
    TessBaseAPIDelete(api);
}
#endif

View File

@@ -0,0 +1,260 @@
#include "ooxml.h"
#include <archive.h>
#include <archive_entry.h>
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
// Cast a C string to the xmlChar string type expected by libxml2.
// NOTE(review): identifiers starting with an underscore followed by an
// uppercase letter are reserved in C - consider renaming.
#define _X(str) ((const xmlChar*)str)
/**
 * Tell whether an archive member is a text-bearing part of a Word,
 * PowerPoint or Excel document, judging by the start of its path.
 *
 * @param part path of the member inside the archive; may be NULL
 * @return TRUE when the part should be read, FALSE otherwise
 */
__always_inline
static int should_read_part(const char *part) {

    if (part == NULL) {
        return FALSE;
    }

    const int is_word_part = STR_STARTS_WITH_CONSTANT(part, "word/document.xml")
                             || STR_STARTS_WITH_CONSTANT(part, "word/footnotes.xml")
                             || STR_STARTS_WITH_CONSTANT(part, "word/endnotes.xml")
                             || STR_STARTS_WITH_CONSTANT(part, "word/footer")
                             || STR_STARTS_WITH_CONSTANT(part, "word/header");
    if (is_word_part) {
        return TRUE;
    }

    const int is_powerpoint_part = STR_STARTS_WITH_CONSTANT(part, "ppt/slides/slide")
                                   || STR_STARTS_WITH_CONSTANT(part, "ppt/notesSlides/slide");
    if (is_powerpoint_part) {
        return TRUE;
    }

    const int is_excel_part = STR_STARTS_WITH_CONSTANT(part, "xl/worksheets/sheet")
                              || STR_STARTS_WITH_CONSTANT(part, "xl/sharedStrings.xml")
                              || STR_STARTS_WITH_CONSTANT(part, "xl/workbook.xml");

    return is_excel_part ? TRUE : FALSE;
}
/**
 * Append the text of every <t> element found in `node`, its following
 * siblings and all of their descendants to `buf`, separated by spaces.
 *
 * @return TEXT_BUF_FULL as soon as the buffer reports it is full, -1 when
 *         libxml2's last recorded error is fatal, 0 otherwise
 */
int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
    xmlErrorPtr last_error = xmlGetLastError();

    if (last_error != NULL && last_error->level == XML_ERR_FATAL) {
        CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", last_error->message)
        return -1;
    }

    xmlNode *cur = node;
    while (cur) {
        // Only elements named exactly "t" carry text
        const int is_text_node = cur->name != NULL && cur->name[0] == 't' && cur->name[1] == '\0';

        if (is_text_node) {
            xmlChar *text = xmlNodeListGetString(xml, cur->xmlChildrenNode, 1);

            if (text) {
                int append_ret = text_buffer_append_string0(buf, (char *) text);
                text_buffer_append_char(buf, ' ');
                xmlFree(text);

                if (append_ret == TEXT_BUF_FULL) {
                    return append_ret;
                }
            }
        }

        // Descend before moving on to the next sibling
        int child_ret = extract_text(ctx, xml, cur->children, buf);
        if (child_ret == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }

        cur = cur->next;
    }

    return 0;
}
/**
 * libxml2 read callback: pull up to `len` bytes of the current archive
 * entry into `buffer`.
 *
 * @param context the struct archive * being read
 * @return the result of archive_read_data(), narrowed to int
 */
int xml_io_read(void *context, char *buffer, int len) {
    struct archive *archive_handle = context;
    const int n_read = (int) archive_read_data(archive_handle, buffer, len);

    return n_read;
}
// libxml2 close callback. Does nothing and reports success; the archive
// handle is released by parse_ooxml().
int xml_io_close(UNUSED(void *context)) {
//noop
return 0;
}
// Returned by read_part() when the XML of a part cannot be used
#define READ_PART_ERR (-2)

/**
 * Parse the current archive entry as XML and append its text to `buf`.
 *
 * @return READ_PART_ERR when the XML cannot be parsed or has no root
 *         element, otherwise the result of extract_text()
 */
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *part_xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (part_xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return READ_PART_ERR;
    }

    int result = READ_PART_ERR;
    xmlNode *root_node = xmlDocGetRootElement(part_xml);

    if (root_node != NULL) {
        result = extract_text(ctx, part_xml, root_node, buf);
    } else {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
    }

    xmlFreeDoc(part_xml);
    return result;
}
/**
 * Parse the current archive entry as an extended-properties XML document
 * and record its page count as MetaPages.
 *
 * @return 0 on success, -1 when the XML cannot be parsed or has no root
 *         element
 */
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *props_xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (props_xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root_node = xmlDocGetRootElement(props_xml);
    if (root_node == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(props_xml);
        return -1;
    }

    if (xmlStrEqual(root_node->name, _X("Properties"))) {
        xmlNode *prop = root_node->children;

        while (prop) {
            xmlChar *value = xmlNodeListGetString(props_xml, prop->xmlChildrenNode, 1);

            if (value != NULL) {
                if (xmlStrEqual(prop->name, _X("Pages"))) {
                    APPEND_LONG_META(doc, MetaPages, strtol((char *) value, NULL, 10))
                }
                xmlFree(value);
            }

            prop = prop->next;
        }
    }

    xmlFreeDoc(props_xml);
    return 0;
}
/**
 * Parse the current archive entry as a core-properties XML document and
 * record title, creator and last modifier as document metadata.
 *
 * @return 0 on success, -1 when the XML cannot be parsed or has no root
 *         element
 */
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;

    xmlDoc *props_xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (props_xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root_node = xmlDocGetRootElement(props_xml);
    if (root_node == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(props_xml);
        return -1;
    }

    if (xmlStrEqual(root_node->name, _X("coreProperties"))) {
        xmlNode *prop = root_node->children;

        while (prop) {
            xmlChar *value = xmlNodeListGetString(props_xml, prop->xmlChildrenNode, 1);

            if (value != NULL) {
                if (xmlStrEqual(prop->name, _X("title"))) {
                    APPEND_STR_META(doc, MetaTitle, (char *) value)
                } else if (xmlStrEqual(prop->name, _X("creator"))) {
                    APPEND_STR_META(doc, MetaAuthor, (char *) value)
                } else if (xmlStrEqual(prop->name, _X("lastModifiedBy"))) {
                    APPEND_STR_META(doc, MetaModifiedBy, (char *) value)
                }
                xmlFree(value);
            }

            prop = prop->next;
        }
    }

    xmlFreeDoc(props_xml);
    return 0;
}
// Embedded thumbnails larger than this (15 MiB) are ignored
#define MAX_TN_SIZE (1024 * 1024 * 15)

/**
 * Store the thumbnail embedded in the document as the thumbnail of `doc`.
 *
 * The current archive entry is read in full and handed to ctx->store()
 * unchanged. Entries that are empty, larger than MAX_TN_SIZE, or that
 * cannot be read completely are skipped.
 */
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
    size_t entry_size = archive_entry_size(entry);

    // A negative size wraps to a huge value and is rejected by the upper bound
    if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
        return;
    }

    char *buf = malloc(entry_size);
    if (buf == NULL) {
        return;
    }

    // Never store a buffer that was not completely filled
    long long n_read = archive_read_data(a, buf, entry_size);
    if (n_read < 0 || (size_t) n_read != entry_size) {
        CTX_LOG_ERROR(doc->filepath, "Could not read embedded thumbnail")
        free(buf);
        return;
    }

    APPEND_TN_META(doc, 1, 1) // Size unknown
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, entry_size);
    free(buf);
}
/**
 * Entry point of the Office Open XML scanner.
 *
 * The file is read in memory and opened as a zip archive. Text-bearing
 * parts are appended to a content buffer (until it is full), document
 * properties become metadata, and an embedded thumbnail is stored when
 * present. Processing stops at the first part that cannot be parsed; the
 * text collected up to that point is still saved as MetaContent.
 */
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    struct archive *a = archive_read_new();
    if (a == NULL) {
        CTX_LOG_ERROR(doc->filepath, "archive_read_new() failed")
        free(buf);
        return;
    }
    archive_read_support_format_zip(a);

    int ret = archive_read_open_memory(a, buf, buf_len);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
        archive_read_free(a);
        free(buf);
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    struct archive_entry *entry;
    int buffer_full = FALSE;

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (!S_ISREG(archive_entry_stat(entry)->st_mode)) {
            continue;
        }

        const char *path = archive_entry_pathname(entry);
        if (path == NULL) {
            // No usable name: nothing to match it against, and strcmp()
            // must not be given NULL.
            continue;
        }

        if (!buffer_full && should_read_part(path) && ctx->content_size > 0) {
            ret = read_part(ctx, a, &tex, doc);
            if (ret == READ_PART_ERR) {
                break;
            } else if (ret == TEXT_BUF_FULL) {
                buffer_full = TRUE;
            }
        } else if (strcmp(path, "docProps/app.xml") == 0) {
            if (read_doc_props_app(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/core.xml") == 0) {
            if (read_doc_props(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
            read_thumbnail(ctx, doc, a, entry);
        }
    }

    if (tex.dyn_buffer.cur > 0) {
        text_buffer_terminate_string(&tex);

        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        if (meta != NULL) {
            meta->key = MetaContent;
            strcpy(meta->str_val, tex.dyn_buffer.buf);
            APPEND_META(doc, meta)
        }
    }

    archive_read_close(a);
    archive_read_free(a);
    text_buffer_destroy(&tex);
    free(buf);
}

View File

@@ -0,0 +1,16 @@
#ifndef SCAN_OOXML_H
#define SCAN_OOXML_H
#include <stdlib.h>
#include "../scan.h"
// Configuration and callbacks for the Office Open XML scanner.
typedef struct {
// Passed to text_buffer_create(); no text is extracted when it is <= 0
long content_size;
// Logging callbacks (presumably used by the CTX_LOG_* macros)
log_callback_t log;
logf_callback_t logf;
// Called with the document's path_md5 and the embedded thumbnail bytes
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

224
third-party/libscan/libscan/raw/raw.c vendored Normal file
View File

@@ -0,0 +1,224 @@
#include "raw.h"
#include <libraw/libraw.h>
#include "../media/media.h"
#include <unistd.h>
#define MIN_SIZE 32
/**
 * Store a thumbnail that libraw delivered as JPEG data by running it
 * through the generic image thumbnailer.
 *
 * NOTE(review): the raw context is cast to scan_media_ctx_t *, which
 * presumably relies on both structs sharing the same layout - confirm.
 *
 * @return the result of store_image_thumbnail()
 */
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
    scan_media_ctx_t *media_ctx = (scan_media_ctx_t *) ctx;
    const int stored = store_image_thumbnail(media_ctx, img->data, img->data_size, doc, "x.jpeg");

    return stored;
}
/**
 * Scale an RGB24 image down to the thumbnail size, encode it as JPEG and
 * store it for `doc`.
 *
 * Images that already fit in ctx->tn_size keep their dimensions; larger
 * ones are scaled so that the longest side equals ctx->tn_size.
 *
 * @return TRUE when a thumbnail was stored; FALSE when the image is empty,
 *         when the result would not exceed MIN_SIZE in both dimensions, or
 *         when scaling or encoding fails
 */
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {

    int dstW;
    int dstH;

    // Also protects the aspect-ratio division below
    if (img->width == 0 || img->height == 0) {
        return FALSE;
    }

    if (img->width <= ctx->tn_size && img->height <= ctx->tn_size) {
        dstW = img->width;
        dstH = img->height;
    } else {
        double ratio = (double) img->width / img->height;
        if (img->width > img->height) {
            dstW = ctx->tn_size;
            dstH = (int) (ctx->tn_size / ratio);
        } else {
            dstW = (int) (ctx->tn_size * ratio);
            dstH = ctx->tn_size;
        }
    }

    if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
        return FALSE;
    }

    int stored = FALSE;
    int dst_buf_len = 0;
    uint8_t *dst_buf = NULL;
    struct SwsContext *sws_ctx = NULL;
    AVCodecContext *jpeg_encoder = NULL;
    AVPacket jpeg_packet;

    AVFrame *scaled_frame = av_frame_alloc();
    if (scaled_frame == NULL) {
        return FALSE;
    }

    sws_ctx = sws_getContext(
            img->width, img->height, AV_PIX_FMT_RGB24,
            dstW, dstH, AV_PIX_FMT_YUVJ420P,
            SIST_SWS_ALGO, 0, 0, 0
    );
    if (sws_ctx == NULL) {
        goto cleanup;
    }

    dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
    if (dst_buf_len < 0) {
        goto cleanup;
    }

    dst_buf = (uint8_t *) av_malloc(dst_buf_len);
    if (dst_buf == NULL) {
        goto cleanup;
    }

    av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);

    const uint8_t *in_data[1] = {img->data};
    int in_line_size[1] = {3 * img->width};

    sws_scale(sws_ctx,
              in_data, in_line_size,
              0, img->height,
              scaled_frame->data, scaled_frame->linesize
    );

    scaled_frame->width = dstW;
    scaled_frame->height = dstH;
    scaled_frame->format = AV_PIX_FMT_YUV420P;

    jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f);
    if (jpeg_encoder == NULL) {
        goto cleanup;
    }

    av_init_packet(&jpeg_packet);
    jpeg_packet.data = NULL;
    jpeg_packet.size = 0;

    // Only store the packet when the encoder actually produced one
    if (avcodec_send_frame(jpeg_encoder, scaled_frame) == 0
        && avcodec_receive_packet(jpeg_encoder, &jpeg_packet) == 0) {
        APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
        stored = TRUE;
    }

    av_packet_unref(&jpeg_packet);

cleanup:
    sws_freeContext(sws_ctx);
    avcodec_free_context(&jpeg_encoder);
    // The frame only borrows dst_buf (av_image_fill_arrays), free it explicitly
    av_free(dst_buf);
    av_frame_free(&scaled_frame);

    return stored;
}
// Sign of a GPS coordinate: south and west references are negative
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)

/**
 * Entry point of the RAW image scanner.
 *
 * Reads the file in memory, opens it with libraw and records the metadata
 * libraw exposes (camera, dimensions, exposure, GPS position, ...). When
 * thumbnails are enabled (ctx->tn_size > 0) the embedded preview is used
 * if it can be stored; otherwise the raw data is unpacked, processed and
 * scaled down.
 */
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
    int ret;
    int errc = 0;
    int tn_ok = 0;
    char tmp[1024];
    libraw_processed_image_t *thumb = NULL;
    libraw_processed_image_t *img = NULL;

    libraw_data_t *libraw_lib = libraw_init(0);

    if (!libraw_lib) {
        CTX_LOG_ERROR("raw.c", "Cannot create libraw handle")
        return;
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        // The handle was already created: do not leak it
        libraw_close(libraw_lib);
        return;
    }

    ret = libraw_open_buffer(libraw_lib, buf, buf_len);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not open raw file")
        goto cleanup;
    }

    if (*libraw_lib->idata.model != '\0') {
        APPEND_STR_META(doc, MetaExifModel, libraw_lib->idata.model)
    }
    if (*libraw_lib->idata.make != '\0') {
        APPEND_STR_META(doc, MetaExifMake, libraw_lib->idata.make)
    }
    if (*libraw_lib->idata.software != '\0') {
        APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
    }
    APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
    APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)

    snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
    APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)

    if (*libraw_lib->other.desc != '\0') {
        APPEND_STR_META(doc, MetaContent, libraw_lib->other.desc)
    }
    if (*libraw_lib->other.artist != '\0') {
        APPEND_STR_META(doc, MetaArtist, libraw_lib->other.artist)
    }

    // localtime() returns NULL for timestamps it cannot represent
    struct tm *time = localtime(&libraw_lib->other.timestamp);
    if (time != NULL) {
        strftime(tmp, sizeof(tmp), "%Y:%m:%d %H:%M:%S", time);
        APPEND_STR_META(doc, MetaExifDateTime, tmp)
    }

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.focal_len);
    APPEND_STR_META(doc, MetaExifFocalLength, tmp)

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
    APPEND_STR_META(doc, MetaExifFNumber, tmp)

    // A missing (zero) shutter time would divide by zero
    if (libraw_lib->other.shutter > 0) {
        int denominator = (int) roundf(1 / libraw_lib->other.shutter);
        snprintf(tmp, sizeof(tmp), "1/%d", denominator);
        APPEND_STR_META(doc, MetaExifExposureTime, tmp)
    }

    // Degrees / minutes / seconds to signed decimal degrees
    libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
    double gps_longitude_dec =
            (gps.longitude[0] + gps.longitude[1] / 60 + gps.longitude[2] / 3600) * DMS_REF(gps.longref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
    if (gps_longitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
    }

    double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
    if (gps_latitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
    }

    APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")

    if (ctx->tn_size <= 0) {
        goto cleanup;
    }

    // First choice: the preview embedded in the file
    ret = libraw_unpack_thumb(libraw_lib);
    if (ret != 0) {
        CTX_LOG_ERRORF(f->filepath, "libraw_unpack_thumb returned error code %d", ret)
        goto cleanup;
    }

    thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
    if (errc != 0 || thumb == NULL) {
        goto cleanup;
    }

    if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
        tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
    } else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
        // TODO: technically this should work but is currently untested
        tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
    }

    if (tn_ok == TRUE) {
        goto cleanup;
    }

    // Fallback: develop the raw data and scale the result down
    ret = libraw_unpack(libraw_lib);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not unpack raw file")
        goto cleanup;
    }

    libraw_dcraw_process(libraw_lib);

    errc = 0;
    img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
    if (errc != 0 || img == NULL) {
        goto cleanup;
    }

    store_thumbnail_rgb24(ctx, img, doc);

cleanup:
    // Single exit point: every path releases the same resources
    if (thumb != NULL) {
        libraw_dcraw_clear_mem(thumb);
    }
    if (img != NULL) {
        libraw_dcraw_clear_mem(img);
    }
    libraw_close(libraw_lib);
    free(buf);
}

Some files were not shown because too many files have changed in this diff Show More