Compare commits

...

56 Commits

Author SHA1 Message Date
e03625838b Settings menu (#30) and UI tweaks 2020-02-29 19:26:09 -05:00
86840b46f4 Version bump 2020-02-27 09:47:06 -05:00
e57f9916eb Rewrite documentation 2020-02-27 09:45:14 -05:00
565ba6ee76 Fix for #29 2020-02-27 09:44:19 -05:00
d83fc2c373 Fix docker build for 1.2.15 2020-02-27 09:42:18 -05:00
d4da28249e --fast option #27 2020-02-22 18:37:08 -05:00
483a454c8d --exclude argument #26 2020-02-22 16:55:35 -05:00
018ac86640 fix build... 2020-02-22 13:20:41 -05:00
398f1aead4 Support for cbr documents 2020-02-22 13:11:19 -05:00
d19a75926b Fix invalid read in terminate_string() 2020-02-22 13:10:40 -05:00
1ac8b40e3d Code style 2020-02-22 09:02:59 -05:00
a8505cb8c1 Fix for #28 2020-02-20 16:42:13 -05:00
ae8652d86e UI tweaks, search syntax (#25) 2020-02-16 15:24:29 -05:00
849beb09d8 hotfix 2020-02-15 19:33:18 -05:00
e1aaaee617 UI tweak 2020-02-15 09:30:14 -05:00
c02b940945 (I forgot to commit this) 2020-02-14 20:58:10 -05:00
2934ddb07f Add image viewer (#2) 2020-02-14 18:28:55 -05:00
7f6f3c02fa OCR tweaks 2020-02-11 21:13:47 -05:00
7f98d5a682 Fix buffer overflow (whoops) 2020-02-09 18:11:29 -05:00
7eb9c5d7d5 Fix web/index issue with NULL mime types 2020-02-09 17:23:49 -05:00
184439aa38 increase minimum image size for OCR 2020-02-09 14:06:59 -05:00
1ce8b298a1 Display EXIF tags on document info panel, remove march=native on openjp 2020-02-09 13:21:19 -05:00
75f99025d9 add exif dateTime, allow some special characters in text meta 2020-02-09 08:47:13 -05:00
ebe852bd5a Fix rewrite-url arg 2020-02-09 08:23:17 -05:00
402b103c49 Fix total count for ES 7.5 2020-02-08 09:25:00 -05:00
e9b6e1cdc2 Turn off auto optimisation in libtesseract build 2020-02-08 08:32:04 -05:00
ed1ce8ab5e Handle XML errors #18 2020-02-07 10:08:01 -05:00
d1fa4febc4 Improve scroll feature, UI fix 2020-02-07 10:08:01 -05:00
048c55df7b Update README.md 2020-02-06 19:56:29 -05:00
f77bc6a025 Update README.md 2020-02-06 19:55:32 -05:00
efdde2734e version bump 2020-02-06 19:28:05 -05:00
66658fa8f7 Remove trailing/leading white space in text meta fields 2020-02-06 19:27:30 -05:00
df41c251e4 (Breaking!) Add some exif tags 2020-02-06 19:21:50 -05:00
3282ab56ba Version bump 2020-02-02 09:26:54 -05:00
8300838d30 Suppress XML parsing errors (#18) 2020-02-02 09:26:03 -05:00
c9870a6d3d Remove -march=native for release build... 2020-02-02 09:03:06 -05:00
a143cc4fcf bundle openssl... 2020-02-02 08:39:20 -05:00
9ef1f3781d fix attempt for #11 2020-02-01 20:04:26 -05:00
bbee8aa721 tesseract ocr path fix 2020-02-01 20:03:59 -05:00
d22f83c797 curl fix 2020-02-01 15:22:43 -05:00
50615486a4 curl fix attempt 2020-02-01 14:42:42 -05:00
ca79e4f797 add /status endpoint 2020-01-28 10:18:37 -05:00
6a9fd08a80 Merge pull request #21 from simon987/wip-20
Fixes #20
2020-01-27 09:16:00 -05:00
cab890dc9b #20 wip 2020-01-27 09:09:42 -05:00
b3c4faf2df Update README.md 2020-01-26 12:37:13 -05:00
353937171a Update README.md 2020-01-20 15:54:53 -05:00
c80002bea4 Bundle libcurl attempt 2 2020-01-18 11:53:12 -05:00
56adee9d81 Bundle libcurl, libopc bugfix #18 2020-01-18 10:25:02 -05:00
d6493d6d5f Bundle libpng 2020-01-16 16:21:38 -05:00
0967e9676d remove static build in CI... 2020-01-16 15:45:18 -05:00
487e998ea0 Display error message on /d/ error 2020-01-16 15:04:50 -05:00
919f45c79c Document info modal #19 2020-01-16 14:37:19 -05:00
d42129cfcb CI fix attempt 2020-01-15 20:11:45 -05:00
754983e34a Minor cleanup 2020-01-15 18:16:06 -05:00
7c8a3e2f9d Support for external json indices 2020-01-14 15:44:31 -05:00
3bb24b4453 Use bundled libtiff 2020-01-14 12:21:26 -05:00
56 changed files with 1656 additions and 396 deletions

6
.gitmodules vendored
View File

@@ -37,3 +37,9 @@
[submodule "lib/leptonica"]
path = lib/leptonica
url = https://github.com/danbloomberg/leptonica
[submodule "lib/libtiff"]
path = lib/libtiff
url = https://gitlab.com/libtiff/libtiff
[submodule "lib/libpng"]
path = lib/libpng
url = https://github.com/glennrp/libpng

View File

@@ -26,6 +26,7 @@ add_executable(
src/parsing/arc.c src/parsing/arc.h
src/parsing/doc.c src/parsing/doc.h
src/log.c src/log.h
src/parsing/cbr.h src/parsing/cbr.c
# argparse
argparse/argparse.h argparse/argparse.c
@@ -45,7 +46,6 @@ add_executable(
find_package(PkgConfig REQUIRED)
set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:/usr/local/lib/pkgconfig/")
#find_package(OpenSSL REQUIRED)
find_package(Freetype REQUIRED)
pkg_check_modules(GLIB REQUIRED glib-2.0)
@@ -84,7 +84,8 @@ target_link_directories(
target_compile_options(sist2
PRIVATE
-Ofast
# -march=native
# -march=native
-fPIC
-fno-stack-protector
-fomit-frame-pointer
)
@@ -111,7 +112,7 @@ TARGET_LINK_LIBRARIES(
${PROJECT_SOURCE_DIR}/lib/libonion_static.a
pthread
curl
m
bz2
${PROJECT_SOURCE_DIR}/lib/libmagic.a
@@ -127,9 +128,17 @@ TARGET_LINK_LIBRARIES(
${PROJECT_SOURCE_DIR}/lib/libtesseract.a
${PROJECT_SOURCE_DIR}/lib/liblept.a
png
tiff
${PROJECT_SOURCE_DIR}/lib/libtiff.a
${PROJECT_SOURCE_DIR}/lib/libpng16.a
stdc++
# curl
${PROJECT_SOURCE_DIR}/lib/libcurl.a
${PROJECT_SOURCE_DIR}/lib/libcrypto.a
${PROJECT_SOURCE_DIR}/lib/libssl.a
dl
pcre
)
add_custom_target(

View File

@@ -3,7 +3,7 @@ MAINTAINER simon987 <me@simon987.net>
RUN apt update
RUN apt install -y libglib2.0-0 libcurl4 libmagic1 libharfbuzz-bin libopenjp2-7 libarchive13 liblzma5 libzstd1 liblz4-1 \
curl libtiff5 libpng16-16
curl libtiff5 libpng16-16 libpcre3
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@@ -16,4 +16,7 @@ RUN mkdir -p /usr/share/tessdata && \
ADD sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]

View File

@@ -5,6 +5,11 @@ strip sist2
version=$(./sist2 --version)
echo "Version ${version}"
docker build . -t simon987/sist2:${version} -t simon987/sist2:latest
docker build . -t simon987/sist2:${version} -t simon987/sist2:latest \
-t docker.pkg.github.com/simon987/sist2/sist2:latest -t docker.pkg.github.com/simon987/sist2/sist2:${version}
docker push simon987/sist2:${version}
docker push simon987/sist2:latest
docker push docker.pkg.github.com/simon987/sist2/sist2:latest
docker push docker.pkg.github.com/simon987/sist2/sist2:${version}
docker run --rm -it simon987/sist2 -v

View File

@@ -8,9 +8,12 @@ sist2 (Simple incremental search tool)
*Warning: sist2 is in early development*
![sist2.png](sist2.png)
## Features
* Fast, low memory usage, multi-threaded
* Mobile-friendly Web interface
* Portable (all its features are packaged in a single executable)
* Extracts text from common file types \*
* Generates thumbnails \*
@@ -26,73 +29,49 @@ sist2 (Simple incremental search tool)
## Getting Started
1. Have an [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) instance running
1.
1. Have an Elasticsearch (>= 6.X.X) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
```bash
docker run -d --name es1 --net sist2_net -p 9200:9200 \
-e "discovery.type=single-node" elasticsearch:7.5.2
```
1. *(or)* Run using docker-compose:
```yaml
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:7.5.2
environment:
- discovery.type=single-node
- "ES_JAVA_OPTS=-Xms1G -Xmx2G"
```
1. Download sist2 executable
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download an [development snapshot](https://files.simon987.net/artifacts/Sist2/Build/) *(Not recommended!)*
1. *(or)* Download a [development snapshot](https://files.simon987.net/artifacts/Sist2/Build/) *(Not recommended!)*
1. *(or)* `docker pull simon987/sist2:latest`
1. See [Usage guide](USAGE.md)
\* *Windows users*: **sist2** runs under [WSL](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
\* *Mac users*: See [#1](https://github.com/simon987/sist2/issues/1)
## Example usage
See help page `sist2 --help` for more details.
See [Usage guide](USAGE.md) for more details
**Scan a directory**
```bash
sist2 scan ~/Documents -o ./orig_idx/
sist2 scan --threads 4 --content-size 16384 /mnt/Pictures
sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
```
**Push index to Elasticsearch or file**
```bash
sist2 index --force-reset ./my_idx
sist2 index --print ./my_idx > raw_documents.ndjson
```
**Start web interface**
```bash
sist2 web --bind 0.0.0.0 --port 4321 ./my_idx1 ./my_idx2 ./my_idx3
```
### Use sist2 with docker
**scan**
```bash
docker run -it \
-v /path/to/files/:/files \
-v $PWD/out/:/out \
simon987/sist2 scan -t 4 /files -o /out/my_idx1
```
**index**
```bash
docker run -it --network host\
-v $PWD/out/:/out \
simon987/sist2 index /out/my_idx1
```
**web**
```bash
docker run --rm --network host -d --name sist2\
-v $PWD/out/my_idx:/idx \
-v $PWD/my/files:/files
simon987/sist2 web --bind 0.0.0.0 /idx
docker stop sist2
```
1. Scan a directory: `sist2 scan ~/Documents -o ./docs_idx`
1. Push index to Elasticsearch: `sist2 index ./docs_idx`
1. Start web interface: `sist2 web ./docs_idx`
## Format support
File type | Library | Content | Thumbnail | Metadata
:---|:---|:---|:---|:---
pdf,xps,cbz,fb2,epub | MuPDF | text+ocr | yes, `png` | title |
pdf,xps,cbz,cbr,fb2,epub | MuPDF | text+ocr | yes, `png` | title |
`audio/*` | ffmpeg | - | yes, `jpeg` | ID3 tags |
`video/*` | ffmpeg | - | yes, `jpeg` | title, comment, artist |
`image/*` | ffmpeg | - | yes, `jpeg` | `EXIF:Artist`, `EXIF:ImageDescription` |
`image/*` | ffmpeg | - | yes, `jpeg` | [Common EXIF tags](https://github.com/simon987/sist2/blob/efdde2734eca9b14a54f84568863b7ffd59bdba3/src/parsing/media.c#L190) |
ttf,ttc,cff,woff,fnt,otf | Freetype2 | - | yes, `bmp` | Name & style |
`text/plain` | *(none)* | yes | no | - |
tar, zip, rar, 7z, ar ... | Libarchive | yes\* | - | no |
@@ -117,7 +96,7 @@ To check if a media file can be parsed without *seek*, execute `cat file.mp4 | f
### OCR
You can enable OCR support for pdf,xps,cbz,fb2,epub file types with the
You can enable OCR support for pdf,xps,cbz,cbr,fb2,epub file types with the
`--ocr <lang>` option. Download the language data files with your
package manager (`apt install tesseract-ocr-eng`) or directly [from GitHub](https://github.com/tesseract-ocr/tesseract/wiki/Data-Files).
@@ -142,8 +121,9 @@ binaries.
```bash
apt install git cmake pkg-config libglib2.0-dev \
libssl-dev uuid-dev python3 libmagic-dev libfreetype6-dev \
libcurl-dev libbz2-dev yasm libharfbuzz-dev ragel \
libarchive-dev libtiff5 libpng16-16
libcurl4-openssl-dev libbz2-dev yasm libharfbuzz-dev ragel \
libarchive-dev libtiff5 libpng16-16 libpango1.0-dev \
libxml2-dev
```
2. Build

275
USAGE.md Normal file
View File

@@ -0,0 +1,275 @@
# Usage
*More examples (specifically with docker/compose) are in progress*
* [scan](#scan)
* [options](#scan-options)
* [examples](#scan-examples)
* [index format](#index-format)
* [index](#index)
* [options](#index-options)
* [examples](#index-examples)
* [web](#web)
* [options](#web-options)
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
```
Usage: sist2 scan [OPTION]... PATH
or: sist2 index [OPTION]... INDEX
or: sist2 web [OPTION]... INDEX...
Lightning-fast file system indexer and search tool.
-h, --help show this help message and exit
-v, --version Show version and exit
--verbose Turn on logging
--very-verbose Turn on debug messages
Scan options
-t, --threads=<int> Number of threads. DEFAULT=1
-q, --quality=<flt> Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. DEFAULT=5
--size=<int> Thumbnail size, in pixels. Use negative value to disable. DEFAULT=500
--content-size=<int> Number of bytes to be extracted from text documents. Use negative value to disable. DEFAULT=32768
--incremental=<str> Reuse an existing index and only scan modified files.
-o, --output=<str> Output directory. DEFAULT=index.sist2/
--rewrite-url=<str> Serve files from this url instead of from disk.
--name=<str> Index display name. DEFAULT: (name of the directory)
--depth=<int> Scan up to DEPTH subdirectories deep. Use 0 to only scan files in PATH. DEFAULT: -1
--archive=<str> Archive file mode (skip|list|shallow|recurse). skip: Don't parse, list: only get file names as text, shallow: Don't parse archives inside archives. DEFAULT: recurse
--ocr=<str> Tesseract language (use tesseract --list-langs to see which are installed on your machine)
-e, --exclude=<str> Files that match this regex will not be scanned
--fast Only index file names & mime type
Index options
--es-url=<str> Elasticsearch url with port. DEFAULT=http://localhost:9200
-p, --print Just print JSON documents to stdout.
--script-file=<str> Path to user script.
--batch-size=<int> Index batch size. DEFAULT: 100
-f, --force-reset Reset Elasticsearch mappings and settings. (You must use this option the first time you use the index command)
Web options
--es-url=<str> Elasticsearch url. DEFAULT=http://localhost:9200
--bind=<str> Listen on this address. DEFAULT=localhost
--port=<str> Listen on this port. DEFAULT=4090
--auth=<str> Basic auth in user:password format
Made by simon987 <me@simon987.net>. Released under GPL-3.0
```
## Scan
### Scan options
* `-t, --threads`
Number of threads for file parsing. **Do not set a number higher than `$(nproc)`!**
* `-q, --quality`
Thumbnail quality, on a scale of 1.0 to 31.0, 1.0 being the best. *Does not affect PDF thumbnails quality*
* `--size`
Thumbnail size in pixels.
* `--content-size`
Number of bytes of text to be extracted from the content of files (plain text and PDFs).
Repeated whitespace and special characters do not count toward this limit.
* `--incremental`
Specify an existing index. Information about files in this index that were not modified (based on *mtime* attribute)
will be copied to the new index and will not be parsed again.
* `-o, --output` Output directory.
* `--rewrite-url` Set the `rewrite_url` option for the web module (See [rewrite_url](#rewrite_url))
* `--name` Set the `name` option for the web module
* `--depth` Maximum scan depth. Set to 0 to only scan files directly in the root directory, set to -1 for infinite depth
* `--archive` Archive file mode.
* skip: Don't parse
* list: Only get file names as text
* shallow: Don't parse archives inside archives.
* recurse: Scan archives recursively (default)
* `--ocr` See [OCR](README.md#OCR)
* `-e, --exclude` Regex pattern to exclude files. A file is excluded if the pattern matches any
part of the full absolute path.
Examples:
* `-e ".*\.ttf"`: Ignore ttf files
* `-e ".*\.(ttf|rar)"`: Ignore ttf and rar files
* `-e "^/mnt/backups/"`: Ignore all files in the `/mnt/backups/` directory
* `-e "^/mnt/Data[12]/"`: Ignore all files in the `/mnt/Data1/` and `/mnt/Data2/` directories
* `-e "(^/usr/)|(^/var/)|(^/media/DRIVE-A/tmp/)|(^/media/DRIVE-B/Trash/)"` Exclude the
`/usr`, `/var`, `/media/DRIVE-A/tmp`, `/media/DRIVE-B/Trash` directories
* `--fast` Only index file names and mime type
### Scan examples
Simple scan
```bash
sist2 scan ~/Documents
sist2 scan \
--threads 4 --content-size 16000000 --quality 1.0 --archive shallow \
--name "My Documents" --rewrite-url "http://nas.domain.local/My Documents/" \
~/Documents -o ./documents.idx/
```
Incremental scan
```
sist2 scan --incremental ./orig_idx/ -o ./updated_idx/ ~/Documents
```
### Index format
A typical `binary` type index structure looks like this:
```
documents.idx/
├── descriptor.json
├── _index_139965416830720
├── _index_139965425223424
├── _index_139965433616128
├── _index_139965442008832
└── thumbs
├── data.mdb
└── lock.mdb
```
The `_index_*` files contain the raw binary index data and are not meant to be
read by other applications. The format is generally compatible across different
sist2 versions.
The `thumbs/` folder is a [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database)
database containing the thumbnails.
The `descriptor.json` file contains general information about the index. The
following fields are safe to modify manually: `root`, `name`, [rewrite_url](#rewrite_url) and `timestamp`.
*Advanced usage*
Instead of using the `scan` module, you can also import an index generated
by a third party application. The 'external' index must have the following format:
```
my_index/
├── descriptor.json
├── _index_0
└── thumbs
├── data.mdb
└── lock.mdb
```
*descriptor.json*:
```json
{
"uuid": "<valid UUID4>",
"version": "_external_v1",
"root": "(optional)",
"name": "<name>",
"rewrite_url": "(optional)",
"type": "json",
"timestamp": 1578971024
}
```
*_index_0*: NDJSON format (One json object per line)
```json
{
"_id": "unique uuid for the file",
"index": "index uuid4 (same one as descriptor.json!)",
"mime": "application/x-cbz",
"size": 14341204,
"mtime": 1578882996,
"extension": "cbz",
"name": "my_book",
"path": "path/to/books",
"content": "text contents of the book",
"title": "Title of the book",
"tag": ["genre.fiction", "author.someguy", "etc..."],
"_keyword": [
{"k": "ISBN", "v": "ABCD34789231"}
],
"_text": [
{"k": "other", "v": "This will be indexed as text"}
]
}
```
You can find the full list of supported fields [here](src/io/serialize.c#L90)
The `_keyword.*` items will be indexed and searchable as **keyword** fields (only full matches allowed).
The `_text.*` items will be indexed and searchable as **text** fields (fuzzy searching allowed)
*thumbs/*:
LMDB key-value store. Keys are **binary** 128-bit UUID4s (`_id` field)
and values are raw image bytes.
Importing an external `binary` type index is technically possible but
it is currently unsupported and has no guarantees of back/forward compatibility.
## Index
### Index options
* `--es-url`
Elasticsearch url and port. If you are using docker, make sure that both containers are on the
same network.
* `-p, --print`
Print index in JSON format to stdout.
* `--script-file`
Path to user script. See [Scripting](scripting/README.md).
* `--batch-size=<int>`
Index batch size. Indexing is generally faster with larger batches, but payloads that
are too large will fail and additional overhead for retrying with smaller sizes may slow
down the process.
* `-f, --force-reset`
Reset Elasticsearch mappings and settings.
**(You must use this option the first time you use the index command)**.
### Index examples
**Push to elasticsearch**
```bash
sist2 index --force-reset --batch-size 1000 --es-url http://localhost:9200 ./my_index/
sist2 index ./my_index/
```
**Save index in JSON format**
```bash
sist2 index --print ./my_index/ > my_index.ndjson
```
**Inspect contents of an index**
```bash
sist2 index --print ./my_index/ | jq | less
```
## Web
### Web options
* `--es-url=<str>` Elasticsearch url.
* `--bind=<str>` Listen on this address.
* `--port=<str>` Listen on this port.
* `--auth=<str>` Basic auth in user:password format
### Web examples
**Single index**
```bash
sist2 web --auth admin:hunter2 --bind 0.0.0.0 --port 8888 my_index
```
**Multiple indices**
```bash
# Indices will be displayed in this order in the web interface
sist2 web index1 index2 index3 index4
```
### rewrite_url
When the `rewrite_url` field is not empty, the web module ignores the `root`
field and will return an HTTP redirect to `<rewrite_url><path>/<name><extension>`
instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
### Link to specific indices
To link to specific indices, you can add a list of comma-separated index names to
the URL: `?i=<name>,<name>`. By default, indices with `"(nsfw)"` in their name are
not displayed.

View File

@@ -5,4 +5,3 @@
cmake .
make
strip sist2
strip sist2_scan

BIN
demo.gif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 MiB

Binary file not shown.

Binary file not shown.

1
lib/libpng Submodule

Submodule lib/libpng added at 301f7a1429

1
lib/libtiff Submodule

Submodule lib/libtiff added at 3db0ff91bc

View File

@@ -320,7 +320,7 @@ video/x-dv, dif|dv
video/x-fli, fli
video/x-isvideo, isu
video/x-motion-jpeg, mjpg
video/x-ms-asf, asf|asx
video/x-ms-asf, asf|asx|wmv
video/x-qtc, qtc
video/x-sgi-movie, movie|mv
application/x-7z-compressed, 7z
1 application/arj arj
320 video/x-fli fli
321 video/x-isvideo isu
322 video/x-motion-jpeg mjpg
323 video/x-ms-asf asf|asx asf|asx|wmv
324 video/x-qtc qtc
325 video/x-sgi-movie movie|mv
326 application/x-7z-compressed 7z

View File

@@ -1,5 +1,9 @@
{
"properties": {
"_tie": {
"type": "keyword",
"doc_values": true
},
"path": {
"type": "text",
"analyzer": "path_analyzer",
@@ -7,25 +11,30 @@
},
"suggest-path": {
"type": "completion",
"analyzer": "keyword"
"analyzer": "case_insensitive_kw_analyzer"
},
"mime": {
"type": "keyword"
},
"videoc": {
"type": "keyword"
"type": "keyword",
"index": false
},
"audioc": {
"type": "keyword"
"type": "keyword",
"index": false
},
"duration": {
"type": "float"
"type": "float",
"index": false
},
"width": {
"type": "integer"
"type": "integer",
"index": false
},
"height": {
"type": "integer"
"type": "integer",
"index": false
},
"mtime": {
"type": "integer"
@@ -70,6 +79,23 @@
"analyzer": "my_nGram",
"type": "text"
},
"_keyword.*": {
"type": "keyword"
},
"_text.*": {
"analyzer": "content_analyzer",
"type": "text",
"fields": {
"nGram": {
"type": "text",
"analyzer": "my_nGram"
}
}
},
"_url": {
"type": "keyword",
"index": false
},
"content": {
"analyzer": "content_analyzer",
"type": "text",
@@ -83,6 +109,30 @@
},
"tag": {
"type": "keyword"
},
"exif_make": {
"type": "text"
},
"exif_model": {
"type": "text"
},
"exif:software": {
"type": "text"
},
"exif_exposure_time": {
"type": "keyword"
},
"exif_fnumber": {
"type": "keyword"
},
"exif_iso_speed_ratings": {
"type": "keyword"
},
"exif_focal_length": {
"type": "keyword"
},
"exif_user_comment": {
"type": "text"
}
}
}

10
schema/pipeline.json Normal file
View File

@@ -0,0 +1,10 @@
{
"description": "Copy _id to _tie",
"processors": [
{
"script": {
"source": "ctx._tie = ctx._id;"
}
}
]
}

View File

@@ -21,6 +21,12 @@
"lowercase"
]
},
"case_insensitive_kw_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
},
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": [

View File

@@ -54,6 +54,11 @@ script.painless.regex.enabled: true
```
Or, if you're using docker add `-e "script.painless.regex.enabled=true"`
**Tag color**
You can specify the color for an individual tag by appending a
hexadecimal color code (`#RRGGBBAA`) to the tag name.
### Examples
If `(20XX)` is in the file name, add the `year.<year>` tag:
@@ -115,3 +120,33 @@ if (ctx._source.path != "") {
tags.add("studio." + names[names.length-1]);
}
```
Parse `EXIF:F Number` tag
```Java
if (ctx._source?.exif_fnumber != null) {
String[] values = ctx._source.exif_fnumber.splitOnToken(' ');
String aperture = String.valueOf(Float.parseFloat(values[0]) / Float.parseFloat(values[1]));
if (aperture == "NaN") {
aperture = "0,0";
}
tags.add("Aperture.f/" + aperture.replace(".", ","));
}
```
Display year and months from `EXIF:DateTime` tag
```Java
if (ctx._source?.exif_datetime != null) {
SimpleDateFormat parser = new SimpleDateFormat("yyyy:MM:dd HH:mm:ss");
Date date = parser.parse(ctx._source.exif_datetime);
SimpleDateFormat yp = new SimpleDateFormat("yyyy");
SimpleDateFormat mp = new SimpleDateFormat("MMMMMMMMM");
String year = yp.format(date);
String month = mp.format(date);
tags.add("Month." + month);
tags.add("Year." + year);
}
```

View File

@@ -5,7 +5,7 @@ THREADS=$(nproc)
cd lib
cd mupdf
make USE_SYSTEM_HARFBUZZ=yes USE_SYSTEM_OPENJPEG=yes HAVE_X11=no HAVE_GLUT=no -j $THREADS
CFLAGS=-fPIC make USE_SYSTEM_HARFBUZZ=yes USE_SYSTEM_OPENJPEG=yes HAVE_X11=no HAVE_GLUT=no -j $THREADS
cd ..
mv mupdf/build/release/libmupdf.a .
@@ -13,8 +13,7 @@ mv mupdf/build/release/libmupdf-third.a .
# openjp2
cd openjpeg
#cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3 -march=native -DNDEBUG"
cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3"
cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3 -DNDEBUG -fPIC"
make -j $THREADS
cd ..
mv openjpeg/bin/libopenjp2.a .
@@ -22,7 +21,7 @@ mv openjpeg/bin/libopenjp2.a .
# harfbuzz
cd harfbuzz
./autogen.sh
./configure --disable-shared --enable-static
CFLAGS=-fPIC ./configure --disable-shared --enable-static
make -j $THREADS
cd ..
mv harfbuzz/src/.libs/libharfbuzz.a .
@@ -33,7 +32,8 @@ cd ffmpeg
--disable-ffprobe --disable-doc\
--disable-manpages --disable-postproc --disable-avfilter \
--disable-alsa --disable-lzma --disable-xlib --disable-debug\
--disable-vdpau --disable-vaapi --disable-sdl2 --disable-network
--disable-vdpau --disable-vaapi --disable-sdl2 --disable-network\
--extra-cflags=-fPIC
make -j $THREADS
cd ..
@@ -74,7 +74,8 @@ mv libmagic/src/.libs/libmagic.a .
cd tesseract
mkdir build
cd build
cmake -DSTATIC=on -DBUILD_TRAINING_TOOLS=off ..
cmake -DSTATIC=on -DBUILD_TRAINING_TOOLS=off -DBUILD_TESTS=off -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_FLAGS="-fPIC" -DAUTO_OPTIMIZE=off ..
make -j $THREADS
cd ../..
mv tesseract/build/libtesseract.a .
@@ -82,9 +83,46 @@ mv tesseract/build/libtesseract.a .
# leptonica
cd leptonica
./autogen.sh
./configure --without-zlib --without-jpeg --without-giflib \
CFLAGS="-fPIC" ./configure --without-zlib --without-jpeg --without-giflib \
--without-giflib --without-libwebp --without-libwebpmux --without-libopenjpeg \
--enable-static --disable-shared
make -j $THREADS
cd ..
mv leptonica/src/.libs/liblept.a .
# tiff
cd libtiff
./autogen.sh
CFLAGS="-fPIC" CXXFLAGS="-fPIC" CXX_FLAGS="-fPIC" ./configure --enable-static --disable-shared --disable-lzw --disable-jpeg --disable-webp \
--disable-lzma --disable-zstd --disable-jbig
make -j $THREADS
cd ..
mv libtiff/libtiff/.libs/libtiff.a .
# png
cd libpng
CFLAGS="-fPIC" ./configure --enable-static --disable-shared
make -j $THREADS
cd ..
mv libpng/.libs/libpng16.a .
# openssl...
git clone --depth 1 -b OpenSSL_1_1_0-stable https://github.com/openssl/openssl
cd openssl
./config --prefix=$(pwd)/../ssl
make depend
make -j $THREADS
make install
cd ..
mv ./openssl/libcrypto.a ./openssl/libssl.a .
# curl
wget -nc https://curl.haxx.se/download/curl-7.68.0.tar.gz
tar -xzf curl-7.68.0.tar.gz
cd curl-7.68.0
./configure --disable-ldap --disable-ldaps --without-librtmp --disable-rtsp --disable-crypto-auth \
--disable-smtp --without-libidn2 --without-nghttp2 --without-brotli --enable-static --disable-shared \
--without-libpsl --with-ssl=$(pwd)/../ssl
make -j $THREADS
cd ..
mv curl-7.68.0/lib/.libs/libcurl.a .

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env bash
cd lib
# mupdf
cd mupdf
HAVE_X11=no HAVE_GLUT=no gmake -j 4
cd ..
mv mupdf/build/release/libmupdf.a .
mv mupdf/build/release/libmupdf-third.a .
# openjp2
cd openjpeg
#cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3 -march=native -DNDEBUG"
cmake . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-O3"
gmake -j 4
cd ..
mv openjpeg/bin/libopenjp2.a .
# harfbuzz
cd harfbuzz
./autogen.sh
./configure --disable-shared --enable-static
gmake -j 4
cd ..
mv harfbuzz/src/.libs/libharfbuzz.a .
# ffmpeg
cd ffmpeg
./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay \
--disable-ffprobe --disable-doc\
--disable-manpages --disable-postproc --disable-avfilter \
--disable-alsa --disable-lzma --disable-xlib --disable-debug\
--disable-vdpau --disable-vaapi --disable-sdl2 --disable-network
gmake -j 4
cd ..
mv ffmpeg/libavcodec/libavcodec.a .
mv ffmpeg/libavformat/libavformat.a .
mv ffmpeg/libavutil/libavutil.a .
mv ffmpeg/libswresample/libswresample.a .
mv ffmpeg/libswscale/libswscale.a .
#bzip2
cd bzip2-1.0.6
make -j 4
cd ..
mv bzip2-1.0.6/libbz2.a .
# magic
cd libmagic
./autogen.sh
./configure --enable-static --disable-shared
make -j 4
cd ..
mv libmagic/src/.libs/libmagic.a .
cd ..

View File

@@ -1,6 +1,9 @@
import json
files = [
"schema/mappings.json",
"schema/settings.json",
"schema/pipeline.json",
]
@@ -9,6 +12,6 @@ def clean(filepath):
for file in files:
with open(file, "rb") as f:
data = f.read()
with open(file, "r") as f:
data = json.dumps(json.load(f), separators=(",", ":")).encode()
print("char %s[%d] = {%s};" % (clean(file), len(data), ",".join(str(int(b)) for b in data)))

BIN
sist2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 889 KiB

View File

@@ -15,6 +15,13 @@
#define DEFAULT_BIND_ADDR "localhost"
#define DEFAULT_PORT "4090"
/*
 * Candidate directories for tesseract language data.
 * scan_args_validate() passes this list to find_file_in_paths() together
 * with "<lang>.traineddata" to locate the language file for --ocr.
 * The array ends with a NULL sentinel entry.
 * NOTE(review): presumably entries are tried in the order listed - confirm
 * against find_file_in_paths().
 */
const char* TESS_DATAPATHS[] = {
"/usr/share/tessdata/",
"/usr/share/tesseract-ocr/tessdata/",
"./",
NULL
};
scan_args_t *scan_args_create() {
scan_args_t *args = calloc(sizeof(scan_args_t), 1);
@@ -136,13 +143,43 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
if (args->tesseract_lang != NULL) {
TessBaseAPI *api = TessBaseAPICreate();
ret = TessBaseAPIInit3(api, TESS_DATAPATH, args->tesseract_lang);
char filename[128];
sprintf(filename, "%s.traineddata", args->tesseract_lang);
const char * path = find_file_in_paths(TESS_DATAPATHS, filename);
if (path == NULL) {
LOG_FATAL("cli.c", "Could not find tesseract language file!");
}
ret = TessBaseAPIInit3(api, path, args->tesseract_lang);
if (ret != 0) {
fprintf(stderr, "Could not initialize tesseract with lang '%s'\n", args->tesseract_lang);
return 1;
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
args->tesseract_path = path;
}
if (args->exclude_regex != NULL) {
const char *error;
int error_offset;
pcre *re = pcre_compile(args->exclude_regex, 0, &error, &error_offset, 0);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_compile returned error: %s (offset:%d)", error, error_offset)
}
pcre_extra *re_extra = pcre_study(re, 0, &error);
if (error != NULL) {
LOG_FATALF("cli.c", "pcre_study returned error: %s", error)
}
ScanCtx.exclude = re;
ScanCtx.exclude_extra = re_extra;
} else {
ScanCtx.exclude = NULL;
}
LOG_DEBUGF("cli.c", "arg quality=%f", args->quality)
@@ -156,13 +193,18 @@ int scan_args_validate(scan_args_t *args, int argc, const char **argv) {
LOG_DEBUGF("cli.c", "arg depth=%d", args->depth)
LOG_DEBUGF("cli.c", "arg path=%s", args->path)
LOG_DEBUGF("cli.c", "arg archive=%s", args->archive)
LOG_DEBUGF("cli.c", "arg ocr=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_lang=%s", args->tesseract_lang)
LOG_DEBUGF("cli.c", "arg tesseract_path=%s", args->tesseract_path)
LOG_DEBUGF("cli.c", "arg exclude=%s", args->exclude_regex)
LOG_DEBUGF("cli.c", "arg fast=%d", args->fast)
return 0;
}
int index_args_validate(index_args_t *args, int argc, const char **argv) {
LogCtx.verbose = 1;
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
return 1;
@@ -224,6 +266,8 @@ int index_args_validate(index_args_t *args, int argc, const char **argv) {
int web_args_validate(web_args_t *args, int argc, const char **argv) {
LogCtx.verbose = 1;
if (argc < 2) {
fprintf(stderr, "Required positional argument: PATH.\n");
return 1;

View File

@@ -17,10 +17,15 @@ typedef struct scan_args {
char *archive;
archive_mode_t archive_mode;
char *tesseract_lang;
const char *tesseract_path;
char *exclude_regex;
int fast;
} scan_args_t;
scan_args_t *scan_args_create();
void scan_args_destroy(scan_args_t *args);
int scan_args_validate(scan_args_t *args, int argc, const char **argv);
typedef struct index_args {
@@ -44,12 +49,15 @@ typedef struct web_args {
} web_args_t;
index_args_t *index_args_create();
void index_args_destroy(index_args_t *args);
web_args_t *web_args_create();
void web_args_destroy(web_args_t *args);
int index_args_validate(index_args_t *args, int argc, const char **argv);
int web_args_validate(web_args_t *args, int argc, const char **argv);
#endif

View File

@@ -28,6 +28,10 @@ struct {
pthread_mutex_t mupdf_mu;
char * tesseract_lang;
const char * tesseract_path;
pcre *exclude;
pcre_extra *exclude_extra;
int fast;
} ScanCtx;
struct {

View File

@@ -20,6 +20,8 @@ typedef struct es_indexer {
static es_indexer_t *Indexer;
void delete_queue(int max);
void print_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
cJSON *line = cJSON_CreateObject();
@@ -27,13 +29,14 @@ void print_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
cJSON_AddStringToObject(line, "_id", uuid_str);
cJSON_AddStringToObject(line, "_index", "sist2");
cJSON_AddStringToObject(line, "_type", "_doc");
cJSON_AddItemToObject(line, "_source", document);
cJSON_AddItemReferenceToObject(line, "_source", document);
char *json = cJSON_PrintUnformatted(line);
printf("%s\n", json);
cJSON_free(line);
cJSON_free(json);
cJSON_Delete(line);
}
void index_json(cJSON *document, const char uuid_str[UUID_STR_LEN]) {
@@ -63,12 +66,12 @@ void execute_update_script(const char *script, const char index_id[UUID_STR_LEN]
cJSON *term_obj = cJSON_AddObjectToObject(query, "term");
cJSON_AddStringToObject(term_obj, "index", index_id);
char * str = cJSON_Print(body);
char *str = cJSON_Print(body);
char bulk_url[4096];
snprintf(bulk_url, 4096, "%s/sist2/_update_by_query?pretty", Indexer->es_url);
response_t *r = web_post(bulk_url, str, "Content-Type: application/json");
printf("Executed user script <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Executed user script <%d>", r->status_code);
cJSON *resp = cJSON_Parse(r->body);
cJSON_free(str);
@@ -79,31 +82,25 @@ void execute_update_script(const char *script, const char index_id[UUID_STR_LEN]
if (error != NULL) {
char *error_str = cJSON_Print(error);
fprintf(stderr, "User script error: \n%s\n", error_str);
LOG_ERRORF("elastic.c", "User script error: \n%s", error_str);
cJSON_free(error_str);
}
cJSON_Delete(resp);
}
void elastic_flush() {
if (Indexer == NULL) {
Indexer = create_indexer(IndexCtx.es_url);
}
void *create_bulk_buffer(int max, int *count, size_t *buf_len) {
es_bulk_line_t *line = Indexer->line_head;
int count = 0;
*count = 0;
size_t buf_size = 0;
size_t buf_cur = 0;
char *buf = malloc(1);
while (line != NULL) {
while (line != NULL && *count < max) {
char action_str[512];
snprintf(action_str, 512,
"{\"index\":{\"_id\":\"%s\", \"_type\":\"_doc\", \"_index\":\"sist2\"}}\n", line->uuid_str);
"{\"index\":{\"_id\":\"%s\", \"_type\":\"_doc\", \"_index\":\"sist2\"}}\n", line->uuid_str);
size_t action_str_len = strlen(action_str);
size_t line_len = strlen(line->line);
@@ -115,47 +112,101 @@ void elastic_flush() {
memcpy(buf + buf_cur, line->line, line_len);
buf_cur += line_len;
es_bulk_line_t *tmp = line;
line = line->next;
free(tmp);
count++;
(*count)++;
}
buf = realloc(buf, buf_size + 1);
*(buf+buf_cur) = '\0';
*(buf + buf_cur) = '\0';
Indexer->line_head = NULL;
Indexer->line_tail = NULL;
Indexer->queued = 0;
*buf_len = buf_cur;
return buf;
}
void _elastic_flush(int max) {
size_t buf_len;
int count;
void *buf = create_bulk_buffer(max, &count, &buf_len);
char bulk_url[4096];
snprintf(bulk_url, 4096, "%s/sist2/_bulk", Indexer->es_url);
snprintf(bulk_url, 4096, "%s/sist2/_bulk?pipeline=tie", Indexer->es_url);
response_t *r = web_post(bulk_url, buf, "Content-Type: application/x-ndjson");
if (r->status_code == 0) {
fprintf(stderr, "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url);
exit(1);
LOG_FATALF("elastic.c", "Could not connect to %s, make sure that elasticsearch is running!\n", IndexCtx.es_url)
}
printf("Indexed %3d documents (%zukB) <%d>\n", count, buf_cur / 1024, r->status_code);
if (r->status_code == 413) {
cJSON *ret_json = cJSON_Parse(r->body);
if (cJSON_GetObjectItem(ret_json, "errors")->valueint != 0) {
cJSON *err;
cJSON_ArrayForEach(err, cJSON_GetObjectItem(ret_json, "items")) {
if (cJSON_GetObjectItem(cJSON_GetObjectItem(err, "index"), "status")->valueint != 201) {
char* str = cJSON_Print(err);
fprintf(stderr, "%s\n", str);
cJSON_free(str);
if (max <= 1) {
LOG_ERRORF("elastic.c", "Single document too large, giving up: {%s}", Indexer->line_head->uuid_str)
free_response(r);
free(buf);
delete_queue(1);
if (Indexer->queued != 0) {
elastic_flush();
}
return;
}
LOG_WARNINGF("elastic.c", "Payload too large, retrying (%d documents)", count);
free_response(r);
free(buf);
_elastic_flush(max / 2);
return;
} else if (r->status_code != 200) {
cJSON *ret_json = cJSON_Parse(r->body);
if (cJSON_GetObjectItem(ret_json, "errors")->valueint != 0) {
cJSON *err;
cJSON_ArrayForEach(err, cJSON_GetObjectItem(ret_json, "items")) {
if (cJSON_GetObjectItem(cJSON_GetObjectItem(err, "index"), "status")->valueint != 201) {
char *str = cJSON_Print(err);
LOG_ERRORF("elastic.c", "%s\n", str);
cJSON_free(str);
}
}
}
cJSON_Delete(ret_json);
delete_queue(Indexer->queued);
} else {
LOG_INFOF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
delete_queue(max);
if (Indexer->queued != 0) {
elastic_flush();
}
}
cJSON_Delete(ret_json);
free_response(r);
free(buf);
}
/**
 * Remove up to `max` lines from the head of the indexer queue and free them.
 *
 * Stops early if the queue runs out before `max` lines were removed, and
 * resets the tail pointer once the queue becomes empty.
 *
 * @param max maximum number of queued lines to drop
 */
void delete_queue(int max) {
    for (int i = 0; i < max && Indexer->line_head != NULL; i++) {
        es_bulk_line_t *tmp = Indexer->line_head;
        Indexer->line_head = tmp->next;

        if (Indexer->line_head == NULL) {
            // Queue is now empty, the tail must not point to the freed node
            Indexer->line_tail = NULL;
        }

        // Every removed node is released, including the last one of the queue
        free(tmp);
        Indexer->queued -= 1;
    }
}
/**
 * Flush the whole bulk queue.
 *
 * The indexer is created from IndexCtx.es_url on first use, then every
 * currently queued line is handed to _elastic_flush().
 */
void elastic_flush() {
    if (!Indexer) {
        Indexer = create_indexer(IndexCtx.es_url);
    }

    const int pending = Indexer->queued;
    _elastic_flush(pending);
}
void elastic_index_line(es_bulk_line_t *line) {
if (Indexer == NULL) {
@@ -192,13 +243,13 @@ es_indexer_t *create_indexer(const char *url) {
return indexer;
}
void destroy_indexer(char * script, char index_id[UUID_STR_LEN]) {
void destroy_indexer(char *script, char index_id[UUID_STR_LEN]) {
char url[4096];
snprintf(url, sizeof(url), "%s/sist2/_refresh", IndexCtx.es_url);
response_t *r = web_post(url, "", NULL);
printf("Refresh index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Refresh index <%d>", r->status_code);
free_response(r);
if (script != NULL) {
@@ -207,12 +258,12 @@ void destroy_indexer(char * script, char index_id[UUID_STR_LEN]) {
snprintf(url, sizeof(url), "%s/sist2/_refresh", IndexCtx.es_url);
r = web_post(url, "", NULL);
printf("Refresh index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Refresh index <%d>", r->status_code);
free_response(r);
snprintf(url, sizeof(url), "%s/sist2/_forcemerge", IndexCtx.es_url);
r = web_post(url, "", NULL);
printf("Merge index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Merge index <%d>", r->status_code);
free_response(r);
if (Indexer != NULL) {
@@ -232,32 +283,37 @@ void elastic_init(int force_reset) {
if (!index_exists || force_reset) {
r = web_delete(url);
printf("Delete index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Delete index <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/sist2", IndexCtx.es_url);
r = web_put(url, "", NULL);
printf("Create index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Create index <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/sist2/_close", IndexCtx.es_url);
r = web_post(url, "", NULL);
printf("Close index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Close index <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/_ingest/pipeline/tie", IndexCtx.es_url);
r = web_put(url, pipeline_json, "Content-Type: application/json");
LOG_INFOF("elastic.c", "Create pipeline <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/sist2/_settings", IndexCtx.es_url);
r = web_put(url, settings_json, "Content-Type: application/json");
printf("Update settings <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Update settings <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/sist2/_mappings/_doc?include_type_name=true", IndexCtx.es_url);
r = web_put(url, mappings_json, "Content-Type: application/json");
printf("Update mappings <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Update mappings <%d>", r->status_code);
free_response(r);
snprintf(url, 4096, "%s/sist2/_open", IndexCtx.es_url);
r = web_post(url, "", NULL);
printf("Open index <%d>\n", r->status_code);
LOG_INFOF("elastic.c", "Open index <%d>", r->status_code);
free_response(r);
}
}
@@ -274,3 +330,28 @@ cJSON *elastic_get_document(const char *uuid_str) {
free_response(r);
return json;
}
/**
 * Query the cluster state for the state of the sist2 index.
 *
 * @return a heap-allocated, NUL-terminated string that the caller must free.
 *         It holds the value of metadata.indices.sist2.state from the
 *         response, or is empty when the request did not return 200 or the
 *         response does not contain that field as a string.
 */
char *elastic_get_status() {
    const size_t status_size = 128;

    char url[4096];
    snprintf(url, sizeof(url),
             "%s/_cluster/state/metadata/sist2?filter_path=metadata.indices.*.state", WebCtx.es_url);

    response_t *r = web_get(url);
    cJSON *json = NULL;
    char *status = malloc(status_size * sizeof(char));
    status[0] = '\0';

    if (r->status_code == 200) {
        json = cJSON_Parse(r->body);
        const cJSON *metadata = cJSON_GetObjectItem(json, "metadata");
        if (metadata != NULL) {
            const cJSON *indices = cJSON_GetObjectItem(metadata, "indices");
            const cJSON *sist2 = cJSON_GetObjectItem(indices, "sist2");
            const cJSON *state = cJSON_GetObjectItem(sist2, "state");

            // The index may be missing from the response: only copy a real string,
            // and never write past the end of the status buffer
            if (cJSON_IsString(state) && state->valuestring != NULL) {
                snprintf(status, status_size, "%s", state->valuestring);
            }
        }
    }

    free_response(r);
    cJSON_Delete(json);
    return status;
}

View File

@@ -30,4 +30,6 @@ void elastic_init(int force_reset);
cJSON *elastic_get_document(const char *uuid_str);
char *elastic_get_status();
#endif

File diff suppressed because one or more lines are too long

View File

@@ -34,6 +34,7 @@ void write_index_descriptor(char *path, index_descriptor_t *desc) {
cJSON_AddStringToObject(json, "version", desc->version);
cJSON_AddStringToObject(json, "root", desc->root);
cJSON_AddStringToObject(json, "name", desc->name);
cJSON_AddStringToObject(json, "type", desc->type);
cJSON_AddStringToObject(json, "rewrite_url", desc->rewrite_url);
cJSON_AddNumberToObject(json, "timestamp", (double) desc->timestamp);
@@ -56,8 +57,7 @@ index_descriptor_t read_index_descriptor(char *path) {
int fd = open(path, O_RDONLY);
if (fd == -1) {
fprintf(stderr, "Invalid/corrupt index (Could not find descriptor)\n");
exit(1);
LOG_FATAL("serialize.c", "Invalid/corrupt index (Could not find descriptor)\n")
}
char *buf = malloc(info.st_size + 1);
@@ -75,6 +75,11 @@ index_descriptor_t read_index_descriptor(char *path) {
descriptor.root_len = (short) strlen(descriptor.root);
strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
strcpy(descriptor.uuid, cJSON_GetObjectItem(json, "uuid")->valuestring);
if (cJSON_GetObjectItem(json, "type") == NULL) {
strcpy(descriptor.type, INDEX_TYPE_BIN);
} else {
strcpy(descriptor.type, cJSON_GetObjectItem(json, "type")->valuestring);
}
cJSON_Delete(json);
free(buf);
@@ -113,6 +118,24 @@ char *get_meta_key_text(enum metakey meta_key) {
return "font_name";
case MetaParent:
return "parent";
case MetaExifMake:
return "exif_make";
case MetaExifSoftware:
return "exif_software";
case MetaExifExposureTime:
return "exif_exposure_time";
case MetaExifFNumber:
return "exif_fnumber";
case MetaExifFocalLength:
return "exif_focal_length";
case MetaExifUserComment:
return "exif_user_comment";
case MetaExifIsoSpeedRatings:
return "exif_iso_speed_ratings";
case MetaExifModel:
return "exif_model";
case MetaExifDateTime:
return "exif_datetime";
default:
return NULL;
}
@@ -172,8 +195,8 @@ void thread_cleanup() {
close(index_fd);
}
void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func func) {
void read_index_bin(const char *path, const char *index_id, index_func func) {
line_t line;
dyn_buffer_t buf = dyn_buffer_create();
@@ -191,7 +214,12 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
char uuid_str[UUID_STR_LEN];
uuid_unparse(line.uuid, uuid_str);
cJSON_AddStringToObject(document, "mime", mime_get_mime_text(line.mime));
const char* mime_text = mime_get_mime_text(line.mime);
if (mime_text == NULL) {
cJSON_AddNullToObject(document, "mime");
} else {
cJSON_AddStringToObject(document, "mime", mime_get_mime_text(line.mime));
}
cJSON_AddNumberToObject(document, "size", (double) line.size);
cJSON_AddNumberToObject(document, "mtime", line.mtime);
@@ -229,7 +257,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
case MetaMediaBitrate: {
long value;
fread(&value, sizeof(long), 1, file);
cJSON_AddNumberToObject(document, get_meta_key_text(key), value);
cJSON_AddNumberToObject(document, get_meta_key_text(key), (double) value);
break;
}
case MetaMediaAudioCodec:
@@ -250,6 +278,15 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
case MetaGenre:
case MetaFontName:
case MetaParent:
case MetaExifMake:
case MetaExifSoftware:
case MetaExifExposureTime:
case MetaExifFNumber:
case MetaExifFocalLength:
case MetaExifUserComment:
case MetaExifIsoSpeedRatings:
case MetaExifDateTime:
case MetaExifModel:
case MetaTitle: {
buf.cur = 0;
while ((c = getc(file)) != 0) {
@@ -262,7 +299,7 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
break;
}
default:
LOG_FATALF("serialize.c", "Invalid meta key (corrupt index): %x", key)
LOG_FATALF("serialize.c", "Invalid meta key (corrupt index): %x", key)
}
key = getc(file);
@@ -275,6 +312,89 @@ void read_index(const char *path, const char index_id[UUID_STR_LEN], index_func
fclose(file);
}
// Fields of a JSON index line that read_index_json() copies as-is into the
// output document. Each name must appear only once: a repeated name would be
// added twice to the document and produce a duplicate JSON key.
const char *json_type_copy_fields[] = {
        "mime", "name", "path", "extension", "index", "size", "mtime", "parent",

        // Meta
        "title", "content", "width", "height", "duration", "audioc", "videoc",
        "bitrate", "artist", "album", "album_artist", "genre", "font_name",

        // Special
        "tag", "_url"
};

// Array fields of a JSON index line. read_index_json() expects each member to
// be an object with string fields "k" and "v" and flattens it to "<field>.<k>": "<v>".
const char *json_type_array_fields[] = {
        "_keyword", "_text"
};
/**
 * Read a JSON-type index file (one JSON document per line) and call `func`
 * for every document.
 *
 * For each line, the fields listed in json_type_copy_fields are copied by
 * reference into a new document and the members of the arrays listed in
 * json_type_array_fields are flattened to "<array>.<k>": "<v>" strings.
 * The document id is taken from the "_id" field of the line.
 *
 * Terminates the program (LOG_FATALF) if the file cannot be opened, if a line
 * is not valid JSON, if "_id" is not a string or if an array member is invalid.
 *
 * @param path     path of the index file
 * @param index_id unused
 * @param func     callback invoked with each document and its id
 */
void read_index_json(const char *path, UNUSED(const char *index_id), index_func func) {

    FILE *file = fopen(path, "r");
    if (file == NULL) {
        LOG_FATALF("serialize.c", "Could not open index file: %s", path)
    }

    while (1) {
        char *line = NULL;
        size_t len = 0;
        // getline() returns a signed count, -1 on EOF/error
        ssize_t read = getline(&line, &len, file);
        if (read == -1) {
            // The buffer may have been allocated even though the call failed
            free(line);
            break;
        }

        cJSON *input = cJSON_Parse(line);
        if (input == NULL) {
            LOG_FATALF("serialize.c", "Could not parse JSON line: \n%s", line)
        }
        free(line);

        const cJSON *id = cJSON_GetObjectItem(input, "_id");
        if (!cJSON_IsString(id)) {
            LOG_FATALF("serialize.c", "Invalid JSON line: _id must be a string field (%s)", path)
        }
        const char *uuid_str = id->valuestring;

        cJSON *document = cJSON_CreateObject();

        for (size_t i = 0; i < (sizeof(json_type_copy_fields) / sizeof(json_type_copy_fields[0])); i++) {
            cJSON *value = cJSON_GetObjectItem(input, json_type_copy_fields[i]);
            if (value != NULL) {
                // Reference only: the value stays owned by `input`
                cJSON_AddItemReferenceToObject(document, json_type_copy_fields[i], value);
            }
        }

        for (size_t i = 0; i < (sizeof(json_type_array_fields) / sizeof(json_type_array_fields[0])); i++) {
            cJSON *arr = cJSON_GetObjectItem(input, json_type_array_fields[i]);
            if (arr != NULL) {
                cJSON *obj;
                cJSON_ArrayForEach(obj, arr) {
                    char key[1024];
                    cJSON *k = cJSON_GetObjectItem(obj, "k");
                    cJSON *v = cJSON_GetObjectItem(obj, "v");
                    if (k == NULL || v == NULL || !cJSON_IsString(k) || !cJSON_IsString(v)) {
                        char *str = cJSON_Print(obj);
                        LOG_FATALF("serialize.c", "Invalid %s member: must contain .k and .v string fields: \n%s",
                                   json_type_array_fields[i], str)
                    }
                    snprintf(key, sizeof(key), "%s.%s", json_type_array_fields[i], k->valuestring);
                    cJSON_AddStringToObject(document, key, v->valuestring);
                }
            }
        }

        func(document, uuid_str);

        // `document` only references items of `input`, so it is released first
        cJSON_Delete(document);
        cJSON_Delete(input);
    }

    fclose(file);
}
/**
 * Read an index file and call `func` for every document it contains.
 *
 * Dispatches on the index type of the descriptor: INDEX_TYPE_BIN is handled
 * by read_index_bin() and INDEX_TYPE_JSON by read_index_json(). Any other
 * type is reported as a fatal error instead of being silently ignored.
 *
 * @param path     path of the index file
 * @param index_id id of the index, forwarded to the reader
 * @param type     index type string from the descriptor
 * @param func     callback invoked for each document
 */
void read_index(const char *path, const char index_id[UUID_STR_LEN], const char *type, index_func func) {
    if (strcmp(type, INDEX_TYPE_BIN) == 0) {
        read_index_bin(path, index_id, func);
    } else if (strcmp(type, INDEX_TYPE_JSON) == 0) {
        read_index_json(path, index_id, func);
    } else {
        LOG_FATALF("serialize.c", "Unknown index type (corrupt descriptor?): %s", type)
    }
}
void incremental_read(GHashTable *table, const char *filepath) {
FILE *file = fopen(filepath, "rb");
line_t line;

View File

@@ -11,7 +11,7 @@ void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
void write_document(document_t *doc);
void read_index(const char *path, const char[UUID_STR_LEN], index_func);
void read_index(const char *path, const char[UUID_STR_LEN], const char *type, index_func);
void incremental_read(GHashTable *table, const char *filepath);

View File

@@ -28,8 +28,18 @@ parse_job_t *create_fs_parse_job(const char *filepath, const struct stat *info,
return job;
}
// Output vector handed to pcre_exec() by EXCLUDED()
int sub_strings[30];

// True when `str` matches the ScanCtx.exclude pattern.
// pcre_exec() expects the NUMBER OF ELEMENTS of the output vector, not its
// size in bytes, otherwise it may write past the end of sub_strings.
#define EXCLUDED(str) (pcre_exec(ScanCtx.exclude, ScanCtx.exclude_extra, (str), (int) strlen(str), 0, 0, \
                                 sub_strings, (int) (sizeof(sub_strings) / sizeof(sub_strings[0]))) >= 0)
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (ftw->level <= ScanCtx.depth && typeflag == FTW_F && S_ISREG(info->st_mode)) {
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
return 0;
}
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}

View File

@@ -56,7 +56,7 @@ void sist_logf(char *filepath, int level, char *format, ...) {
log_len += 1;
}
write(STDOUT_FILENO, log_str, log_len);
write(STDERR_FILENO, log_str, log_len);
}
void sist_log(char *filepath, int level, char *str) {

View File

@@ -6,7 +6,7 @@
#define EPILOG "Made by simon987 <me@simon987.net>. Released under GPL-3.0"
static const char *const Version = "1.2.0";
static const char *const Version = "1.2.17";
static const char *const usage[] = {
"sist2 scan [OPTION]... PATH",
"sist2 index [OPTION]... INDEX",
@@ -29,6 +29,7 @@ void init_dir(const char *dirpath) {
uuid_unparse(uuid, ScanCtx.index.desc.uuid);
time(&ScanCtx.index.desc.timestamp);
strcpy(ScanCtx.index.desc.version, Version);
strcpy(ScanCtx.index.desc.type, INDEX_TYPE_BIN);
write_index_descriptor(path, &ScanCtx.index.desc);
}
@@ -48,14 +49,19 @@ void sist2_scan(scan_args_t *args) {
strncpy(ScanCtx.index.path, args->output, sizeof(ScanCtx.index.path));
strncpy(ScanCtx.index.desc.name, args->name, sizeof(ScanCtx.index.desc.name));
strncpy(ScanCtx.index.desc.root, args->path, sizeof(ScanCtx.index.desc.root));
strncpy(ScanCtx.index.desc.rewrite_url, args->rewrite_url, sizeof(ScanCtx.index.desc.rewrite_url));
ScanCtx.index.desc.root_len = (short) strlen(ScanCtx.index.desc.root);
ScanCtx.tesseract_lang = args->tesseract_lang;
ScanCtx.tesseract_path = args->tesseract_path;
ScanCtx.fast = args->fast;
init_dir(ScanCtx.index.path);
ScanCtx.mime_table = mime_get_mime_table();
ScanCtx.ext_table = mime_get_ext_table();
cbr_init();
char store_path[PATH_MAX];
snprintf(store_path, PATH_MAX, "%sthumbs", ScanCtx.index.path);
mkdir(store_path, S_IWUSR | S_IRUSR | S_IXUSR);
@@ -130,8 +136,12 @@ void sist2_index(index_args_t *args) {
snprintf(descriptor_path, PATH_MAX, "%s/descriptor.json", args->index_path);
index_descriptor_t desc = read_index_descriptor(descriptor_path);
if (strcmp(desc.version, Version) != 0) {
fprintf(stderr, "Version mismatch! Index is v%s but executable is v%s\n", desc.version, Version);
LOG_DEBUGF("main.c", "descriptor version %s (%s)", desc.version, desc.type)
if (strcmp(desc.version, Version) != 0 && strcmp(desc.version, INDEX_VERSION_EXTERNAL) != 0) {
fprintf(stderr, "Version mismatch! Index is %s but executable is %s/%s\n",
desc.version, Version, INDEX_VERSION_EXTERNAL);
return;
}
@@ -153,7 +163,7 @@ void sist2_index(index_args_t *args) {
if (strncmp(de->d_name, "_index_", sizeof("_index_") - 1) == 0) {
char file_path[PATH_MAX];
snprintf(file_path, PATH_MAX, "%s/%s", args->index_path, de->d_name);
read_index(file_path, desc.uuid, f);
read_index(file_path, desc.uuid, desc.type, f);
}
}
closedir(dir);
@@ -231,6 +241,8 @@ int main(int argc, const char *argv[]) {
"shallow: Don't parse archives inside archives. DEFAULT: recurse"),
OPT_STRING(0, "ocr", &scan_args->tesseract_lang, "Tesseract language (use tesseract --list-langs to see "
"which are installed on your machine)"),
OPT_STRING('e', "exclude", &scan_args->exclude_regex, "Files that match this regex will not be scanned"),
OPT_BOOLEAN(0, "fast", &scan_args->fast, "Only index file names & mime type"),
OPT_GROUP("Index options"),
OPT_STRING(0, "es-url", &common_es_url, "Elasticsearch url with port. DEFAULT=http://localhost:9200"),
@@ -277,9 +289,7 @@ int main(int argc, const char *argv[]) {
}
sist2_scan(scan_args);
}
else if (strcmp(argv[0], "index") == 0) {
} else if (strcmp(argv[0], "index") == 0) {
int err = index_args_validate(index_args, argc, argv);
if (err != 0) {
@@ -295,8 +305,7 @@ int main(int argc, const char *argv[]) {
}
sist2_web(web_args);
}
else {
} else {
fprintf(stderr, "Invalid command: '%s'\n", argv[0]);
argparse_usage(&argparse);
return 1;

View File

@@ -1,8 +1,6 @@
#include "arc.h"
#include "src/ctx.h"
#define ARC_BUF_SIZE 8192
int should_parse_filtered_file(const char *filepath, int ext) {
char tmp[PATH_MAX * 2];

View File

@@ -2,6 +2,7 @@
#define SIST2_ARC_H
#include "src/sist.h"
#define ARC_BUF_SIZE 8192
int should_parse_filtered_file(const char *filepath, int ext);

52
src/parsing/cbr.c Normal file
View File

@@ -0,0 +1,52 @@
#import "cbr.h"
#import "src/ctx.h"
// Mime ids of "application/x-cbr" and "application/x-cbz", set by cbr_init()
unsigned int cbr_mime;
unsigned int cbz_mime;
// Look up the cbr/cbz mime ids by their mime string.
// Reads ScanCtx.mime_table, so that table has to be set before this is called.
void cbr_init() {
cbr_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbr");
cbz_mime = mime_get_mime_by_string(ScanCtx.mime_table, "application/x-cbz");
}
// Returns non-zero when `mime` is the id stored in cbr_mime by cbr_init()
int is_cbr(unsigned int mime) {
return mime == cbr_mime;
}
/**
 * Copy every entry of the opened RAR reader into the opened zip writer.
 *
 * @return 0 when all entries were copied, -1 when the writer failed and the
 *         output archive cannot be completed
 */
static int cbr_copy_entries(struct archive *rar_in, struct archive *zip_out, document_t *doc) {
    struct archive_entry *entry;
    char arc_buf[ARC_BUF_SIZE];

    while (archive_read_next_header(rar_in, &entry) == ARCHIVE_OK) {
        int ret = archive_write_header(zip_out, entry);
        if (ret == ARCHIVE_FATAL) {
            LOG_ERRORF(doc->filepath, "(cbr.c) archive_write_header() returned error code %d", ret)
            return -1;
        }

        // archive_read_data() returns a signed byte count, <= 0 at end of entry or on error
        ssize_t len;
        while ((len = archive_read_data(rar_in, arc_buf, ARC_BUF_SIZE)) > 0) {
            ssize_t written = archive_write_data(zip_out, arc_buf, (size_t) len);
            if (written < 0) {
                // Typically means that the fixed-size output buffer is exhausted
                LOG_ERRORF(doc->filepath, "(cbr.c) archive_write_data() returned error code %zd", written)
                return -1;
            }
        }
    }

    return 0;
}

/**
 * Parse a cbr (RAR) comic book.
 *
 * The RAR archive in `buf` is re-packed in memory as a zip archive, which is
 * then handed to parse_pdf() with the document mime temporarily set to
 * cbz_mime. The mime is set back to cbr_mime afterwards.
 *
 * The zip is written to a buffer of twice the input size. If the conversion
 * fails (allocation failure, unreadable input, output buffer too small), the
 * error is logged and the document is not parsed.
 *
 * @param buf     content of the cbr file
 * @param buf_len size of `buf` in bytes
 * @param doc     document being parsed
 */
void parse_cbr(void *buf, size_t buf_len, document_t *doc) {
    size_t out_buf_len = buf_len * 2;
    size_t out_buf_used = 0;

    char *out_buf = malloc(out_buf_len);
    if (out_buf == NULL) {
        LOG_ERRORF(doc->filepath, "(cbr.c) Could not allocate %zu bytes for cbr conversion", out_buf_len)
        return;
    }

    struct archive *rar_in = archive_read_new();
    archive_read_support_filter_none(rar_in);
    archive_read_support_format_rar(rar_in);

    struct archive *zip_out = archive_write_new();
    archive_write_set_format_zip(zip_out);

    int converted = 0;

    int ret = archive_read_open_memory(rar_in, buf, buf_len);
    if (ret != ARCHIVE_OK) {
        LOG_ERRORF(doc->filepath, "(cbr.c) archive_read_open_memory() returned error code %d", ret)
    } else {
        ret = archive_write_open_memory(zip_out, out_buf, out_buf_len, &out_buf_used);
        if (ret != ARCHIVE_OK) {
            LOG_ERRORF(doc->filepath, "(cbr.c) archive_write_open_memory() returned error code %d", ret)
        } else if (cbr_copy_entries(rar_in, zip_out, doc) == 0) {
            // Closing flushes the end of the zip archive, it can still fail if the buffer is full
            ret = archive_write_close(zip_out);
            if (ret != ARCHIVE_OK && ret != ARCHIVE_WARN) {
                LOG_ERRORF(doc->filepath, "(cbr.c) archive_write_close() returned error code %d", ret)
            } else {
                converted = 1;
            }
        }
    }

    // The free functions close the archives first when they are still open
    archive_write_free(zip_out);
    archive_read_free(rar_in);

    if (converted) {
        doc->mime = cbz_mime;
        parse_pdf(out_buf, out_buf_used, doc);
        doc->mime = cbr_mime;
    }

    free(out_buf);
}

12
src/parsing/cbr.h Normal file
View File

@@ -0,0 +1,12 @@
#ifndef SIST2_CBR_H
#define SIST2_CBR_H
#include "src/sist.h"
void cbr_init();
int is_cbr(unsigned int mime);
void parse_cbr(void *buf, size_t buf_len, document_t *doc);
#endif

View File

@@ -1,10 +1,20 @@
#include "doc.h"
#include "src/ctx.h"
void dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
int dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
mce_skip_attributes(reader);
xmlErrorPtr err = xmlGetLastError();
if (err != NULL) {
if (err->level == XML_ERR_FATAL) {
LOG_ERRORF("doc.c", "Got fatal XML error while parsing document: %s", err->message)
return -1;
} else {
LOG_ERRORF("doc.c", "Got recoverable XML error while parsing document: %s", err->message)
}
}
mce_start_children(reader) {
mce_start_element(reader, NULL, _X("t")) {
mce_skip_attributes(reader);
@@ -18,10 +28,14 @@ void dump_text(mceTextReader_t *reader, dyn_buffer_t *buf) {
} mce_end_element(reader);
mce_start_element(reader, NULL, NULL) {
dump_text(reader, buf);
int ret = dump_text(reader, buf);
if (ret != 0) {
return ret;
}
} mce_end_element(reader);
} mce_end_children(reader)
return 0;
}
__always_inline
@@ -52,23 +66,28 @@ int should_read_part(opcPart part) {
}
__always_inline
void read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
int read_part(opcContainer *c, dyn_buffer_t *buf, opcPart part, document_t *doc) {
mceTextReader_t reader;
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", 0);
int ret = opcXmlReaderOpen(c, &reader, part, NULL, "UTF-8", XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET);
if (ret != OPC_ERROR_NONE) {
LOG_ERRORF(doc->filepath, "(doc.c) opcXmlReaderOpen() returned error code %d", ret);
return;
return -1;
}
mce_start_document(&reader) {
mce_start_element(&reader, NULL, NULL) {
dump_text(&reader, buf);
ret = dump_text(&reader, buf);
if (ret != 0) {
mceTextReaderCleanup(&reader);
return -1;
}
} mce_end_element(&reader);
} mce_end_document(&reader);
mceTextReaderCleanup(&reader);
return 0;
}
void parse_doc(void *mem, size_t mem_len, document_t *doc) {
@@ -88,7 +107,10 @@ void parse_doc(void *mem, size_t mem_len, document_t *doc) {
opcPart part = opcPartGetFirst(c);
do {
if (should_read_part(part)) {
read_part(c, &buf, part, doc);
int ret = read_part(c, &buf, part, doc);
if (ret != 0) {
break;
}
}
} while ((part = opcPartGetNext(c, part)));

View File

@@ -193,6 +193,24 @@ append_video_meta(AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc,
APPEND_TAG_META(doc, tag, MetaArtist)
} else if (strcmp(tag->key, "ImageDescription") == 0) {
APPEND_TAG_META(doc, tag, MetaContent)
} else if (strcmp(tag->key, "Make") == 0) {
APPEND_TAG_META(doc, tag, MetaExifMake)
} else if (strcmp(tag->key, "Model") == 0) {
APPEND_TAG_META(doc, tag, MetaExifModel)
} else if (strcmp(tag->key, "Software") == 0) {
APPEND_TAG_META(doc, tag, MetaExifSoftware)
} else if (strcmp(tag->key, "FNumber") == 0) {
APPEND_TAG_META(doc, tag, MetaExifFNumber)
} else if (strcmp(tag->key, "FocalLength") == 0) {
APPEND_TAG_META(doc, tag, MetaExifFocalLength)
} else if (strcmp(tag->key, "UserComment") == 0) {
APPEND_TAG_META(doc, tag, MetaExifUserComment)
} else if (strcmp(tag->key, "ISOSpeedRatings") == 0) {
APPEND_TAG_META(doc, tag, MetaExifIsoSpeedRatings)
} else if (strcmp(tag->key, "ExposureTime") == 0) {
APPEND_TAG_META(doc, tag, MetaExifExposureTime)
} else if (strcmp(tag->key, "DateTime") == 0) {
APPEND_TAG_META(doc, tag, MetaExifDateTime)
}
}
}

View File

@@ -8,7 +8,7 @@
#define MIME_EMPTY 1
#define DONT_PARSE 0x80000000
#define SHOULD_PARSE(mime_id) (mime_id & DONT_PARSE) != DONT_PARSE && mime_id != 0
#define SHOULD_PARSE(mime_id) (ScanCtx.fast == 0 && (mime_id & DONT_PARSE) != DONT_PARSE && mime_id != 0)
#define PDF_MASK 0x40000000
#define IS_PDF(mime_id) (mime_id & PDF_MASK) == PDF_MASK

View File

@@ -1293,6 +1293,7 @@ g_hash_table_insert(ext_table, "isu", (gpointer)video_x_isvideo);
g_hash_table_insert(ext_table, "mjpg", (gpointer)video_x_motion_jpeg);
g_hash_table_insert(ext_table, "asf", (gpointer)video_x_ms_asf);
g_hash_table_insert(ext_table, "asx", (gpointer)video_x_ms_asf);
g_hash_table_insert(ext_table, "wmv", (gpointer)video_x_ms_asf);
g_hash_table_insert(ext_table, "qtc", (gpointer)video_x_qtc);
g_hash_table_insert(ext_table, "movie", (gpointer)video_x_sgi_movie);
g_hash_table_insert(ext_table, "mv", (gpointer)video_x_sgi_movie);

View File

@@ -149,6 +149,13 @@ void parse(void *arg) {
if (doc_buf != buf && doc_buf != NULL) {
free(doc_buf);
}
} else if (is_cbr(doc.mime)) {
void *cbr_buf = read_all(job, (char *) buf, bytes_read);
parse_cbr(cbr_buf, doc.size, &doc);
if (cbr_buf != buf && cbr_buf != NULL) {
free(cbr_buf);
}
}
//Parent meta

View File

@@ -1,7 +1,8 @@
#include "pdf.h"
#include "src/ctx.h"
#define MIN_OCR_SIZE 128
#define MIN_OCR_SIZE 350
#define MIN_OCR_LEN 10
__thread text_buffer_t thread_buffer;
@@ -82,7 +83,8 @@ fz_page *render_cover(fz_context *ctx, document_t *doc, fz_document *fzdoc) {
fz_drop_pixmap(ctx, pixmap);
if (err != 0) {
LOG_WARNINGF(doc->filepath, "fz_new_buffer_from_pixmap_as_png() returned error code [%d] %s", err, ctx->error.message)
LOG_WARNINGF(doc->filepath, "fz_new_buffer_from_pixmap_as_png() returned error code [%d] %s", err,
ctx->error.message)
fz_drop_page(ctx, cover);
return NULL;
}
@@ -127,37 +129,40 @@ int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
return 0;
}
#define IS_VALID_BPP(d) (d==1 || d==2 || d==4 || d==8 || d==16 || d==24 || d==32)
void fill_image(fz_context *ctx, UNUSED(fz_device *dev),
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
UNUSED(fz_color_params color_params)) {
fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
UNUSED(fz_color_params color_params)) {
int l2factor = 0;
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE) {
if (img->w > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && IS_VALID_BPP(img->n)) {
fz_pixmap *pix = img->get_pixmap(ctx, img, NULL, img->w, img->h, &l2factor);
if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
TessBaseAPI *api = TessBaseAPICreate();
TessBaseAPIInit3(api, TESS_DATAPATH, ScanCtx.tesseract_lang);
TessBaseAPIInit3(api, ScanCtx.tesseract_path, ScanCtx.tesseract_lang);
TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
TessBaseAPISetSourceResolution(api, pix->xres);
char *text = TessBaseAPIGetUTF8Text(api);
size_t len = strlen(text);
text_buffer_append_string(&thread_buffer, text, len - 1);
LOG_DEBUGF(
"pdf.c",
"(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
)
if (len >= MIN_OCR_LEN) {
text_buffer_append_string(&thread_buffer, text, len - 1);
LOG_DEBUGF(
"pdf.c",
"(OCR) %dx%d got %dB from tesseract (%s), buffer:%dB",
pix->w, pix->h, len, ScanCtx.tesseract_lang, thread_buffer.dyn_buffer.cur
)
}
TessBaseAPIEnd(api);
TessBaseAPIDelete(api);
fz_drop_pixmap(ctx, pix);
}
fz_drop_pixmap(ctx, pix);
}
}

View File

@@ -2,7 +2,6 @@
#define SIST2_PDF_H
#include "src/sist.h"
#include <tesseract/capi.h>
void parse_pdf(void *buf, size_t buf_len, document_t *doc);

View File

@@ -3,7 +3,6 @@
#define UUID_STR_LEN 37
#define UNUSED(x) __attribute__((__unused__)) x
#define TESS_DATAPATH "/usr/share/tessdata/"
#include <glib-2.0/glib.h>
#include <unistd.h>
@@ -34,6 +33,9 @@
#include <archive_entry.h>
#include <opc/opc.h>
#include <libxml/xmlstring.h>
#define BOOL int
#include <tesseract/capi.h>
#include <pcre.h>
#include <onion/onion.h>
#include <onion/handler.h>
@@ -58,6 +60,7 @@
#include "parsing/font.h"
#include "parsing/arc.h"
#include "parsing/doc.h"
#include "parsing/cbr.h"
#include "cli.h"
#include "log.h"
#include "utf8.h/utf8.h"

View File

@@ -2,9 +2,9 @@
#define SIST2_TYPES_H
#define META_INT_MASK 0xF0
#define META_STR_MASK 0xE0
#define META_LONG_MASK 0xD0
#define META_INT_MASK 0x80
#define META_STR_MASK 0x40
#define META_LONG_MASK 0x20
#define IS_META_INT(key) (key & META_INT_MASK) == META_INT_MASK
#define IS_META_LONG(key) (key & META_LONG_MASK) == META_LONG_MASK
#define IS_META_STR(meta) (meta->key & META_STR_MASK) == META_STR_MASK
@@ -31,16 +31,31 @@ enum metakey {
MetaTitle = 12 | META_STR_MASK,
MetaFontName = 13 | META_STR_MASK,
MetaParent = 14 | META_STR_MASK,
MetaExifMake = 15 | META_STR_MASK,
MetaExifSoftware = 16 | META_STR_MASK,
MetaExifExposureTime = 17 | META_STR_MASK,
MetaExifFNumber = 18 | META_STR_MASK,
MetaExifFocalLength = 19 | META_STR_MASK,
MetaExifUserComment = 20 | META_STR_MASK,
MetaExifModel = 21 | META_STR_MASK,
MetaExifIsoSpeedRatings = 22 | META_STR_MASK,
MetaExifDateTime = 23 | META_STR_MASK,
//Note to self: this will break after 31 entries
};
#define INDEX_TYPE_BIN "binary"
#define INDEX_TYPE_JSON "json"
#define INDEX_VERSION_EXTERNAL "_external_v1"
typedef struct index_descriptor {
char uuid[UUID_STR_LEN];
char version[6];
char version[64];
long timestamp;
char root[PATH_MAX];
char rewrite_url[8196];
short root_len;
char name[1024];
char type[64];
} index_descriptor_t;
typedef struct index_t {
@@ -80,7 +95,6 @@ typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
typedef void (*close_func_t)(struct vfile *);
typedef struct vfile {
union {
int fd;
struct archive *arc;

View File

@@ -1,4 +1,5 @@
#include "util.h"
#include "src/ctx.h"
dyn_buffer_t dyn_buffer_create() {
dyn_buffer_t buf;
@@ -90,7 +91,11 @@ text_buffer_t text_buffer_create(int max_size) {
}
void text_buffer_terminate_string(text_buffer_t *buf) {
dyn_buffer_write_char(&buf->dyn_buffer, '\0');
if (buf->dyn_buffer.cur > 0 && *(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) == ' ') {
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur - 1) = '\0';
} else {
dyn_buffer_write_char(&buf->dyn_buffer, '\0');
}
}
__always_inline
@@ -171,8 +176,8 @@ int text_buffer_append_string0(text_buffer_t *buf, char *str) {
int text_buffer_append_char(text_buffer_t *buf, int c) {
if (SHOULD_IGNORE_CHAR(c)) {
if (!buf->last_char_was_whitespace) {
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
buf->last_char_was_whitespace = TRUE;
@@ -254,8 +259,10 @@ char *abspath(const char *path) {
if (abs == NULL) {
return NULL;
}
abs = realloc(abs, strlen(abs) + 2);
strcat(abs, "/");
if (strlen(abs) > 1) {
abs = realloc(abs, strlen(abs) + 2);
strcat(abs, "/");
}
wordfree(&w);
return abs;
@@ -317,4 +324,29 @@ GHashTable *incremental_get_table() {
return file_table;
}
/**
 * Find the first folder of a NULL-terminated list that contains `filename`.
 *
 * Each folder is resolved with abspath() (folders that cannot be resolved are
 * skipped) and the file is looked up with stat().
 *
 * @param paths    NULL-terminated array of folder paths
 * @param filename name of the file to look for, appended to the resolved folder
 * @return the matching element of `paths` (as given, not resolved),
 *         or NULL if the file was not found in any folder
 */
const char *find_file_in_paths(const char *paths[], const char *filename) {

    for (const char **dir = paths; *dir != NULL; dir++) {
        char *resolved = abspath(*dir);

        if (resolved != NULL) {
            char candidate[PATH_MAX];
            snprintf(candidate, sizeof(candidate), "%s%s", resolved, filename);
            LOG_DEBUGF("util.c", "Looking for '%s' in folder '%s'", filename, resolved)
            free(resolved);

            struct stat st;
            if (stat(candidate, &st) != -1) {
                return *dir;
            }
        }
    }

    return NULL;
}

View File

@@ -7,7 +7,7 @@
#define INITIAL_BUF_SIZE 1024 * 16
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
#define SHOULD_KEEP_CHAR(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'z') || (c > 127))
#define SHOULD_KEEP_CHAR(c) ((c >= '\'' && c <= ';') || (c >= 'A' && c <= 'z') || (c > 127))
typedef struct dyn_buffer {
@@ -74,5 +74,6 @@ int incremental_get(GHashTable *table, unsigned long inode_no);
int incremental_mark_file_for_copy(GHashTable *table, unsigned long inode_no);
const char *find_file_in_paths(const char **paths, const char *filename);
#endif

View File

@@ -110,7 +110,7 @@ int thumbnail(void *p, onion_request *req, onion_response *res) {
int written = onion_response_write(res, data, data_len);
onion_response_flush(res);
if (written != data_len || data_len == 0) {
printf("Couldn't write thumb\n");
LOG_DEBUG("serve.c", "Couldn't write thumbnail");
}
free(data);
@@ -181,7 +181,12 @@ int chunked_response_file(const char *filename, const char *mime,
}
}
onion_response_set_length(res, length);
onion_response_set_header(res, "Content-Type", mime);
if (mime != NULL) {
onion_response_set_header(res, "Content-Type", mime);
} else {
onion_response_set_header(res, "Content-Type", "application/octet-stream");
}
onion_response_write_headers(res);
if ((onion_request_get_flags(request) & OR_HEAD) == OR_HEAD) {
length = 0;
@@ -214,21 +219,13 @@ int chunked_response_file(const char *filename, const char *mime,
return OCS_PROCESSED;
}
int search(void *p, onion_request *req, onion_response *res) {
int search(UNUSED(void *p), onion_request *req, onion_response *res) {
int flags = onion_request_get_flags(req);
if ((flags & OR_METHODS) != OR_POST) {
return OCS_NOT_PROCESSED;
}
char *scroll_param;
const char *scroll = onion_request_get_query(req, "scroll");
if (scroll != NULL) {
scroll_param = "?scroll=3m";
} else {
scroll_param = "";
}
const struct onion_block_t *block = onion_request_get_data(req);
if (block == NULL) {
@@ -236,7 +233,7 @@ int search(void *p, onion_request *req, onion_response *res) {
}
char url[4096];
snprintf(url, 4096, "%s/sist2/_search%s", WebCtx.es_url, scroll_param);
snprintf(url, 4096, "%s/sist2/_search", WebCtx.es_url);
response_t *r = web_post(url, onion_block_data(block), "Content-Type: application/json");
set_default_headers(res);
@@ -254,43 +251,6 @@ int search(void *p, onion_request *req, onion_response *res) {
return OCS_PROCESSED;
}
/*
 * HTTP handler: relays a scroll request to Elasticsearch.
 *
 * Only GET requests are handled; any other method returns OCS_NOT_PROCESSED.
 * The "scroll_id" query parameter is posted as JSON, together with a "3m"
 * scroll value, to "<es_url>/_search/scroll". On a 200 reply the response
 * body is written back to the client as application/json.
 */
int scroll(void *p, onion_request *req, onion_response *res) {
int flags = onion_request_get_flags(req);
if ((flags & OR_METHODS) != OR_GET) {
return OCS_NOT_PROCESSED;
}
char url[4096];
snprintf(url, 4096, "%s/_search/scroll", WebCtx.es_url);
/* NOTE(review): scroll_id is not checked for NULL before being added to the request body. */
const char *scroll_id = onion_request_get_query(req, "scroll_id");
cJSON *json = cJSON_CreateObject();
cJSON_AddStringToObject(json, "scroll_id", scroll_id);
cJSON_AddStringToObject(json, "scroll", "3m");
char *json_str = cJSON_PrintUnformatted(json);
response_t *r = web_post(url, json_str, "Content-Type: application/json");
/* The request body is no longer needed once the POST has returned. */
cJSON_Delete(json);
cJSON_free(json_str);
/* NOTE(review): r is dereferenced without a NULL check; confirm web_post() never returns NULL. */
if (r->status_code != 200) {
free_response(r);
return OCS_NOT_PROCESSED;
}
set_default_headers(res);
onion_response_set_header(res, "Content-Type", "application/json");
onion_response_set_header(res, "Content-Disposition", "application/json");
onion_response_set_length(res, r->size);
onion_response_write(res, r->body, r->size);
free_response(r);
return OCS_PROCESSED;
}
int serve_file_from_url(cJSON *json, index_t *idx, onion_request *req, onion_response *res) {
const char *path = cJSON_GetObjectItem(json, "path")->valuestring;
@@ -327,7 +287,7 @@ int serve_file_from_disk(cJSON *json, index_t *idx, onion_request *req, onion_re
return chunked_response_file(full_path, mime, 1, req, res);
}
int index_info(void *p, onion_request *req, onion_response *res) {
int index_info(UNUSED(void *p), onion_request *req, onion_response *res) {
cJSON *json = cJSON_CreateObject();
cJSON *arr = cJSON_AddArrayToObject(json, "indices");
@@ -353,14 +313,47 @@ int index_info(void *p, onion_request *req, onion_response *res) {
return OCS_PROCESSED;
}
int file(void *p, onion_request *req, onion_response *res) {
/**
 * HTTP handler: returns the "_source" object of one document as JSON.
 *
 * The document id is read from query argument "1" and looked up with
 * elastic_get_document(). The document must carry an "index" field that
 * resolves through get_index_by_id(); otherwise the request is not processed.
 *
 * @return OCS_PROCESSED when the id argument is missing (nothing is written)
 *         or when the JSON body was written; OCS_NOT_PROCESSED when the
 *         document has no usable "index", the index is unknown, or the
 *         document could not be serialized.
 */
int document_info(UNUSED(void *p), onion_request *req, onion_response *res) {
    const char *arg_uuid = onion_request_get_query(req, "1");
    if (arg_uuid == NULL) {
        return OCS_PROCESSED;
    }

    cJSON *doc = elastic_get_document(arg_uuid);
    cJSON *source = cJSON_GetObjectItem(doc, "_source");
    cJSON *index_id = cJSON_GetObjectItem(source, "index");
    /* valuestring is NULL when "index" is present but not a JSON string. */
    if (index_id == NULL || index_id->valuestring == NULL) {
        cJSON_Delete(doc);
        return OCS_NOT_PROCESSED;
    }

    /* The index itself is not used further; the lookup only validates it. */
    if (get_index_by_id(index_id->valuestring) == NULL) {
        cJSON_Delete(doc);
        return OCS_NOT_PROCESSED;
    }

    /* Serialize before touching the response so a failure leaves it untouched. */
    char *json_str = cJSON_PrintUnformatted(source);
    if (json_str == NULL) {
        cJSON_Delete(doc);
        return OCS_NOT_PROCESSED;
    }

    onion_response_set_header(res, "Content-Type", "application/json");
    onion_response_write0(res, json_str);

    /* Strings produced by cJSON_Print* are released with cJSON_free(). */
    cJSON_free(json_str);
    cJSON_Delete(doc);
    return OCS_PROCESSED;
}
int file(UNUSED(void *p), onion_request *req, onion_response *res) {
const char *arg_uuid = onion_request_get_query(req, "1");
if (arg_uuid == NULL) {
return OCS_PROCESSED;
}
const char *next = arg_uuid;
cJSON *doc = NULL;
cJSON *index_id = NULL;
cJSON *source = NULL;
@@ -398,6 +391,23 @@ int file(void *p, onion_request *req, onion_response *res) {
return ret;
}
/**
 * HTTP handler: health check.
 *
 * Replies 204 when elastic_get_status() reports "open" and 500 otherwise.
 * No body is written.
 *
 * @return always OCS_PROCESSED
 */
int status(UNUSED(void *p), UNUSED(onion_request *req), onion_response *res) {
    set_default_headers(res);
    onion_response_set_header(res, "Content-Type", "application/x-empty");

    char *es_status = elastic_get_status();
    /*
     * elastic_get_status() is not visible from here; guard against a NULL
     * result so a failed lookup yields a 500 instead of crashing in strcmp().
     */
    if (es_status != NULL && strcmp(es_status, "open") == 0) {
        onion_response_set_code(res, 204);
    } else {
        onion_response_set_code(res, 500);
    }

    /* free(NULL) is a no-op, so no extra check is needed. */
    free(es_status);
    return OCS_PROCESSED;
}
void serve(const char *hostname, const char *port) {
onion *o = onion_new(O_POOL);
onion_set_timeout(o, 3500);
@@ -416,7 +426,7 @@ void serve(const char *hostname, const char *port) {
onion_url_add(urls, "img/sprite-skin-flat.png", img_sprite_skin_flag);
onion_url_add(urls, "es", search);
onion_url_add(urls, "scroll", scroll);
onion_url_add(urls, "status", status);
onion_url_add(
urls,
"^t/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})/"
@@ -424,6 +434,7 @@ void serve(const char *hostname, const char *port) {
thumbnail
);
onion_url_add(urls, "^f/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", file);
onion_url_add(urls, "^d/([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})$", document_info);
onion_url_add(urls, "i", index_info);

File diff suppressed because one or more lines are too long

View File

@@ -2,6 +2,26 @@
outline: 0;
}
.info-icon {
width: 1rem;
margin-right: 0.2rem;
cursor: pointer;
color: #757575;
line-height: 1rem;
height: 1.1rem;
}
.info-icon:hover {
color: inherit;
}
.modal-title {
max-width: calc(100% - 2rem);
overflow: hidden;
text-overflow: ellipsis;
}
.path-row {
display: -ms-flexbox;
display: flex;
@@ -32,7 +52,7 @@ body {
margin-top: 1em;
}
.card {
.card, .modal-content {
margin-top: 1em;
background: #212121;
color: #e0e0e0;
@@ -40,6 +60,27 @@ body {
border: none;
}
.table {
color: #e0e0e0;
}
.table td, .table th {
border: none;
}
.table thead th {
border-bottom: 1px solid #646464;
}
.modal-header .close {
color: #e0e0e0;
text-shadow: none;
}
.modal-header {
border-bottom: 1px solid #646464;
}
.sub-document {
background: #37474F !important;
}
@@ -61,6 +102,7 @@ body {
border-bottom: none;
border-left: none;
border-right: none;
padding: .25rem 0.5rem;
}
.list-group-item:first-child {
@@ -134,6 +176,8 @@ body {
.file-title {
width: 100%;
line-height: 1rem;
height: 1.1rem;
font-size: 10pt;
white-space: nowrap;
text-overflow: ellipsis;
@@ -156,7 +200,7 @@ body {
max-width: 100%;
max-height: 175px;
margin: 0 auto 0;
padding: 3px 3px 0 3px;
padding: 3px 3px 0;
width: auto;
height: auto;
}
@@ -165,7 +209,7 @@ body {
display: block;
max-width: 64px;
max-height: 64px;
margin: 0 auto 0;
margin: 0 auto;
width: auto;
height: auto;
}
@@ -200,11 +244,18 @@ body {
}
mark {
background: #fff217;
background: rgba(251, 191, 41, 0.25);
border-radius: 0;
padding: 1px 0;
color: inherit;
}
.content-div mark {
background: rgba(251, 191, 41, 0.40);
color: white;
}
.content-div {
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
font-size: 13px;
@@ -348,10 +399,6 @@ option {
margin-top: 1em;
}
.list-group-item {
padding: .25rem 0.5rem;
}
.wrapper-sm {
min-width: 64px;
}
@@ -380,3 +427,25 @@ option {
margin-top: -14px;
font-size: 11px;
}
/* Viewports 800px wide and up: hide .small-btn, show .large-btn. */
@media (min-width: 800px) {
.small-btn {
display: none;
}
.large-btn {
display: inherit;
}
}
/*
 * Viewports up to 801px wide: show .small-btn, hide .large-btn.
 * NOTE(review): this range overlaps the query above between 800px and 801px.
 * Both match there; these rules come later with equal specificity, so they win.
 */
@media (max-width: 801px) {
.small-btn {
display: inherit;
}
.large-btn {
display: none;
}
}
.input-group > .custom-select:not(:first-child), .input-group > .form-control:not(:first-child) {
border-right: none;
}

View File

@@ -2,6 +2,25 @@
outline: 0;
}
.info-icon {
width: 1rem;
margin-right: 0.2rem;
cursor: pointer;
color: #757575;
line-height: 1rem;
height: 1rem;
}
.info-icon:hover {
color: inherit;
}
.modal-title {
max-width: calc(100% - 2rem);
overflow: hidden;
text-overflow: ellipsis;
}
.path-row {
display: -ms-flexbox;
display: flex;
@@ -100,6 +119,8 @@ body {
.file-title {
width: 100%;
line-height: 1rem;
height: 1.1rem;
font-size: 10pt;
white-space: nowrap;
text-overflow: ellipsis;
@@ -163,6 +184,7 @@ mark {
background: #fff217;
border-radius: 0;
padding: 1px 0;
color: inherit;
}
.content-div {
@@ -266,3 +288,25 @@ mark {
margin-top: -14px;
font-size: 11px;
}
/* Viewports 800px wide and up: hide .small-btn, show .large-btn. */
@media (min-width: 800px) {
.small-btn {
display: none;
}
.large-btn {
display: inherit;
}
}
/*
 * Viewports up to 801px wide: show .small-btn, hide .large-btn.
 * NOTE(review): this range overlaps the query above between 800px and 801px.
 * Both match there; these rules come later with equal specificity, so they win.
 */
@media (max-width: 801px) {
.small-btn {
display: inherit;
}
.large-btn {
display: none;
}
}
.input-group > .custom-select:not(:first-child), .input-group > .form-control:not(:first-child) {
border-right: none;
}

1
web/css/smartphoto.min.css vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -75,6 +75,10 @@ function shouldPlayVideo(hit) {
return videoc !== "hevc" && videoc !== "mpeg2video" && videoc !== "wmv3";
}
/**
 * Tell whether a hit should be shown as a raw image.
 *
 * The result is truthy when the hit has a mime type starting with "image/"
 * and its "videoc" field is not "tiff". A missing or empty mime is returned
 * as-is (falsy), exactly like the short-circuit expression it comes from.
 *
 * @param hit search hit with a "_source" object
 */
function shouldDisplayRawImage(hit) {
    const source = hit["_source"];
    const mime = source["mime"];
    return mime && mime.startsWith("image/") && source["videoc"] !== "tiff";
}
function makePlaceholder(w, h, small) {
let calc;
if (small) {
@@ -96,10 +100,14 @@ function makePlaceholder(w, h, small) {
return el;
}
/**
 * Format the file extension of a hit for display.
 *
 * @param hit search hit with a "_source" object
 * @returns {string} "." followed by the extension, or "" when the hit has no
 *          own "extension" property or the extension is the empty string
 */
function ext(hit) {
    const source = hit["_source"];
    if (source.hasOwnProperty("extension") && source["extension"] !== "") {
        return "." + source["extension"];
    }
    return "";
}
function makeTitle(hit) {
let title = document.createElement("div");
title.setAttribute("class", "file-title");
let extension = hit["_source"].hasOwnProperty("extension") && hit["_source"]["extension"] !== "" ? "." + hit["_source"]["extension"] : "";
let extension = ext(hit);
applyNameToTitle(hit, title, extension);
@@ -113,7 +121,7 @@ function getTags(hit, mimeCategory) {
switch (mimeCategory) {
case "video":
case "image":
if (hit["_source"].hasOwnProperty("videoc")) {
if (hit["_source"].hasOwnProperty("videoc") && hit["_source"]["videoc"]) {
const formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-video");
formatTag.appendChild(document.createTextNode(hit["_source"]["videoc"].replace(" ", "")));
@@ -121,7 +129,7 @@ function getTags(hit, mimeCategory) {
}
break;
case "audio": {
if (hit["_source"].hasOwnProperty("audioc")) {
if (hit["_source"].hasOwnProperty("audioc") && hit["_source"]["audioc"]) {
let formatTag = document.createElement("span");
formatTag.setAttribute("class", "badge badge-pill badge-audio");
formatTag.appendChild(document.createTextNode(hit["_source"]["audioc"]));
@@ -153,11 +161,44 @@ function getTags(hit, mimeCategory) {
return tags
}
/**
 * Build the click handler for a hit's info button.
 *
 * When invoked, the handler fetches the document through getDocumentInfo(),
 * sets the modal title to the document name plus extension, fills the modal
 * body with a Field/Value table of selected keys, appends the document
 * "content" when it is present and non-empty, and opens the modal.
 *
 * @param hit search hit; its "_id" selects the document to fetch
 * @returns {Function} click handler taking no arguments
 */
function infoButtonCb(hit) {
    return () => {
        getDocumentInfo(hit["_id"]).then(doc => {
            $("#modal-title").text(doc["name"] + ext(hit));

            const tbody = $("<tbody>");
            $("#modal-body").empty()
                .append($("<table class='table table-sm'>")
                    .append($("<thead>")
                        .append($("<tr>")
                            .append($("<th>").text("Field"))
                            .append($("<th>").text("Value"))
                        )
                    )
                    .append(tbody)
                );

            // Plain keys that are always shown; prefixed keys are matched below.
            const displayFields = new Set([
                "mime", "size", "mtime", "path", "title", "width", "height", "duration", "audioc", "videoc",
                "bitrate", "artist", "album", "album_artist", "genre", "font_name", "tag"
            ]);

            Object.keys(doc)
                .filter(key => key.startsWith("_keyword.") || key.startsWith("_text.") || displayFields.has(key) || key.startsWith("exif_"))
                .forEach(key => {
                    // .text() keeps field values from being interpreted as HTML.
                    tbody.append($("<tr>")
                        .append($("<td>").text(key))
                        .append($("<td>").text(doc[key]))
                    );
                });

            if (doc.hasOwnProperty("content") && doc["content"]) {
                $("#modal-body").append($("<div class='content-div'>").text(doc["content"]))
            }

            $("#modal").modal();
        });
    }
}
function createDocCard(hit) {
let docCard = document.createElement("div");
docCard.setAttribute("class", "card");
@@ -172,6 +213,7 @@ function createDocCard(hit) {
let link = document.createElement("a");
link.setAttribute("href", "f/" + hit["_id"]);
link.setAttribute("target", "_blank");
link.style.maxWidth = "calc(100% - 1.2rem)";
link.appendChild(title);
if (hit["_source"].hasOwnProperty("parent")) {
@@ -271,7 +313,15 @@ function createDocCard(hit) {
sizeTag.setAttribute("class", "text-muted");
tagContainer.appendChild(sizeTag);
docCardBody.appendChild(link);
const titleWrapper = document.createElement("div");
titleWrapper.style.display = "flex";
const infoButton = makeInfoButton(hit);
titleWrapper.appendChild(infoButton);
titleWrapper.appendChild(link);
docCardBody.appendChild(titleWrapper);
docCard.appendChild(docCardBody);
docCardBody.appendChild(tagContainer);
@@ -327,6 +377,7 @@ function makeThumbnail(mimeCategory, hit, imgWrapper, small) {
|| hit["_source"]["mime"] === "application/pdf"
|| hit["_source"]["mime"] === "application/epub+zip"
|| hit["_source"]["mime"] === "application/x-cbz"
|| hit["_source"]["mime"] === "application/x-cbr"
|| hit["_source"].hasOwnProperty("font_name")
) {
thumbnail = document.createElement("img");
@@ -337,6 +388,15 @@ function makeThumbnail(mimeCategory, hit, imgWrapper, small) {
}
thumbnail.setAttribute("src", `t/${hit["_source"]["index"]}/${hit["_id"]}`);
if (!hit["_source"]["parent"] && shouldDisplayRawImage(hit)) {
imgWrapper.setAttribute("id", "sp" + hit["_id"]);
imgWrapper.setAttribute("data-src", `t/${hit["_source"]["index"]}/${hit["_id"]}`);
imgWrapper.setAttribute("href", `f/${hit["_id"]}`);
imgWrapper.setAttribute("data-caption", hit["_source"]["path"] + "/" + hit["_source"]["name"] + ext(hit));
imgWrapper.setAttribute("data-group", "p" + Math.floor(docCount / SIZE));
imgWrapper.classList.add("sp");
}
const placeholder = makePlaceholder(hit["_source"]["width"], hit["_source"]["height"], small);
imgWrapper.appendChild(placeholder);
@@ -352,6 +412,14 @@ function makeThumbnail(mimeCategory, hit, imgWrapper, small) {
return thumbnail;
}
/**
 * Create the clickable info icon for a hit.
 *
 * @param hit search hit handed to infoButtonCb() to build the click handler
 * @returns {HTMLSpanElement} span with class "info-icon" holding the icon glyph
 */
function makeInfoButton(hit) {
    const icon = document.createElement("span");
    const onClick = infoButtonCb(hit);

    icon.className = "info-icon";
    icon.textContent = "🛈";
    icon.addEventListener("click", onClick);

    return icon;
}
function createDocLine(hit) {
const mime = hit["_source"]["mime"];
@@ -372,6 +440,8 @@ function createDocLine(hit) {
isSubDocument = true;
}
const infoButton = makeInfoButton(hit);
const title = makeTitle(hit);
let link = document.createElement("a");
@@ -380,8 +450,13 @@ function createDocLine(hit) {
link.appendChild(title);
const titleDiv = document.createElement("div");
titleDiv.setAttribute("class", "file-title");
titleDiv.appendChild(link);
const titleWrapper = document.createElement("div");
titleWrapper.style.display = "flex";
titleWrapper.appendChild(infoButton);
titleWrapper.appendChild(link);
titleDiv.appendChild(titleWrapper);
line.appendChild(media);
@@ -442,8 +517,7 @@ function makePreloader() {
function makePageIndicator(searchResult) {
let pageIndicator = document.createElement("div");
pageIndicator.setAttribute("class", "page-indicator font-weight-light");
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
const totalHits = searchResult["aggregations"]["total_count"]["value"];
pageIndicator.appendChild(document.createTextNode(docCount + " / " + totalHits));
return pageIndicator;
}
@@ -472,26 +546,29 @@ function makeStatsCard(searchResult) {
resultMode.appendChild(gridMode);
resultMode.appendChild(listMode);
if (mode === "grid") {
if (CONF.options.display === "grid") {
gridMode.classList.add("active")
} else {
listMode.classList.add("active")
}
gridMode.addEventListener("click", () => {
mode = "grid";
localStorage.setItem("mode", mode);
console.log("what");
console.log(CONF.options);
CONF.options.display = "grid";
console.log(CONF.options);
CONF.save();
console.log(CONF.options);
searchDebounced();
});
listMode.addEventListener("click", () => {
mode = "list";
localStorage.setItem("mode", mode);
CONF.options.display = "list";
CONF.save();
searchDebounced();
});
let stat = document.createElement("span");
const totalHits = searchResult["hits"]["total"].hasOwnProperty("value")
? searchResult["hits"]["total"]["value"] : searchResult["hits"]["total"];
const totalHits = searchResult["aggregations"]["total_count"]["value"];
stat.appendChild(document.createTextNode(totalHits + " results in " + searchResult["took"] + "ms"));
statsCardBody.appendChild(stat);
@@ -511,7 +588,7 @@ function makeStatsCard(searchResult) {
function makeResultContainer() {
let resultContainer = document.createElement("div");
if (mode === "grid") {
if (CONF.options.display === "grid") {
resultContainer.setAttribute("class", "card-columns");
} else {
resultContainer.setAttribute("class", "list-group");

56
web/js/jquery-smartphoto.min.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -6,19 +6,46 @@ let tagTree;
let searchBar = document.getElementById("searchBar");
let pathBar = document.getElementById("pathBar");
let scroll_id = null;
let lastDoc = null;
let reachedEnd = false;
let docCount = 0;
let coolingDown = false;
let searchBusy = true;
let selectedIndices = [];
let mode;
if (localStorage.getItem("mode") === null) {
mode = "grid";
} else {
mode = localStorage.getItem("mode")
const CONF = new Settings();
const _defaults = {
display: "grid",
fuzzy: true,
highlight: true
};
/**
 * User settings persisted in localStorage under the "options" key.
 *
 * `options` holds the current values; load() fills it from storage and
 * save() writes it back. Both refresh the fuzzy toggle in the search bar.
 */
function Settings() {
    this.options = {};

    // Mirror the stored fuzzy preference onto the search bar checkbox.
    this._onUpdate = function() {
        $("#fuzzyToggle").prop("checked", this.options.fuzzy);
    }

    // Read the options from storage. Defaults are copied, never aliased, so
    // later edits to `options` cannot alter `_defaults`; keys missing from the
    // stored object fall back to their default value.
    this.load = function() {
        const raw = window.localStorage.getItem("options");
        let stored = null;
        if (raw !== null) {
            try {
                stored = JSON.parse(raw);
            } catch (e) {
                // Unreadable stored value: fall back to the defaults.
                console.log(e);
            }
        }

        this.options = Object.assign({}, _defaults, stored);
        this._onUpdate();
    }

    // Write the current options to storage.
    this.save = function() {
        window.localStorage.setItem("options", JSON.stringify(this.options));
        this._onUpdate();
    }
}
function showEsError() {
$.toast({
heading: "Elasticsearch connection error",
@@ -53,6 +80,7 @@ window.onload = () => {
}
window.location.reload();
})
CONF.load();
};
function toggleFuzzy() {
@@ -81,6 +109,13 @@ $.jsonPost("i").then(resp => {
});
});
/**
 * Fetch the stored fields of a document from the "d/<id>" endpoint.
 *
 * On failure the error is logged to the console and the Elasticsearch error
 * toast is shown.
 *
 * @param id document id
 * @returns the jQuery request object returned by $.getJSON
 */
function getDocumentInfo(id) {
    const request = $.getJSON("d/" + id);
    return request.fail(err => {
        console.log(err);
        showEsError();
    });
}
function handleTreeClick(tree) {
return (event, node, handler) => {
event.preventTreeDefault();
@@ -144,8 +179,8 @@ $.jsonPost("es", {
target: '#mimeTree'
});
mimeTree.on("node.click", handleTreeClick(mimeTree));
mimeTree.select();
mimeTree.node("any").deselect();
mimeTree.deselect();
mimeTree.node("any").select();
});
function leafTag(tag) {
@@ -242,7 +277,7 @@ new autoComplete({
function insertHits(resultContainer, hits) {
for (let i = 0; i < hits.length; i++) {
if (mode === "grid") {
if (CONF.options.display === "grid") {
resultContainer.appendChild(createDocCard(hits[i]));
} else {
resultContainer.appendChild(createDocLine(hits[i]));
@@ -252,41 +287,18 @@ function insertHits(resultContainer, hits) {
}
window.addEventListener("scroll", function () {
if (!coolingDown && !searchBusy) {
if (!searchBusy) {
let threshold = 400;
if ((window.innerHeight + window.scrollY) >= document.body.offsetHeight - threshold) {
coolingDown = true;
doScroll();
if (!reachedEnd) {
coolingDown = true;
search(lastDoc);
}
}
}
});
/**
 * Fetch the next page of results from the "scroll" endpoint using the current
 * scroll_id and append it to #searchResults.
 *
 * If the request fails, the page is reloaded.
 */
function doScroll() {
$.get("scroll", {scroll_id: scroll_id})
.then(searchResult => {
let searchResults = document.getElementById("searchResults");
let hits = searchResult["hits"]["hits"];
//Page indicator
let pageIndicator = makePageIndicator(searchResult);
searchResults.appendChild(pageIndicator);
//Result container
let resultContainer = makeResultContainer();
searchResults.appendChild(resultContainer);
insertHits(resultContainer, hits);
// Only a full page re-enables scrolling; a short page leaves coolingDown set.
if (hits.length === SIZE) {
coolingDown = false;
}
})
.fail(() => {
window.location.reload();
})
}
function getSelectedNodes(tree) {
let selectedNodes = [];
@@ -307,21 +319,25 @@ function getSelectedNodes(tree) {
return selectedNodes
}
function search() {
function search(after = null) {
lastDoc = null;
if (searchBusy) {
return;
}
searchBusy = true;
//Clear old search results
let searchResults = document.getElementById("searchResults");
while (searchResults.firstChild) {
searchResults.removeChild(searchResults.firstChild);
//Clear old search results
let preload;
if (!after) {
while (searchResults.firstChild) {
searchResults.removeChild(searchResults.firstChild);
}
preload = makePreloader();
searchResults.appendChild(preload);
}
const preload = makePreloader();
searchResults.appendChild(preload);
let query = searchBar.value;
let empty = query === "";
let condition = empty ? "should" : "must";
@@ -355,55 +371,79 @@ function search() {
filters.push([{terms: {"tag": tags}}]);
}
$.jsonPost("es?scroll=1", {
let q = {
"_source": {
excludes: ["content"]
excludes: ["content", "_tie"]
},
query: {
bool: {
[condition]: {
multi_match: {
simple_query_string: {
query: query,
type: "most_fields",
fields: fields,
operator: "and"
default_operator: "and"
}
},
filter: filters
}
},
sort: [
"_score"
"sort": [
{"_score": {"order": "desc"}},
{"_tie": {"order": "asc"}}
],
highlight: {
aggs:
{
total_size: {"sum": {"field": "size"}},
total_count: {"value_count": {"field": "size"}}
},
size: SIZE,
};
if (after) {
q.search_after = [after["_score"], after["_id"]];
}
if (CONF.options.highlight) {
q.highlight = {
pre_tags: ["<mark>"],
post_tags: ["</mark>"],
fields: {
post_tags: ["</mark>"],
fields: {
content: {},
// "content.nGram": {},
name: {},
"name.nGram": {},
font_name: {},
}
},
aggs: {
total_size: {"sum": {"field": "size"}}
},
size: SIZE,
}).then(searchResult => {
scroll_id = searchResult["_scroll_id"];
};
}
preload.remove();
//Search stats
searchResults.appendChild(makeStatsCard(searchResult));
$.jsonPost("es", q).then(searchResult => {
let hits = searchResult["hits"]["hits"];
if (hits) {
lastDoc = hits[hits.length - 1];
}
if (!after) {
preload.remove();
searchResults.appendChild(makeStatsCard(searchResult));
} else {
let pageIndicator = makePageIndicator(searchResult);
searchResults.appendChild(pageIndicator);
}
//Setup page
let resultContainer = makeResultContainer();
searchResults.appendChild(resultContainer);
docCount = 0;
insertHits(resultContainer, searchResult["hits"]["hits"]);
window.setTimeout(() => {
$(".sp").SmartPhoto({animationSpeed: 0, swipeTopToClose: true, showAnimation: false, forceInterval: 50});
}, 100);
if (!after) {
docCount = 0;
}
reachedEnd = hits.length !== SIZE;
insertHits(resultContainer, hits);
searchBusy = false;
});
}
@@ -464,6 +504,14 @@ function updateIndices() {
document.getElementById("indices").addEventListener("change", updateIndices);
updateIndices();
// Keyboard shortcut: releasing "/" or "Escape" scrolls to the search bar and focuses it.
window.onkeyup = function(e) {
    const isShortcut = e.key === "/" || e.key === "Escape";
    if (!isShortcut) {
        return;
    }

    const bar = document.getElementById("searchBar");
    bar.scrollIntoView();
    bar.focus();
};
//Suggest
function getPathChoices() {
return new Promise(getPaths => {
@@ -482,3 +530,30 @@ function getPathChoices() {
})
}
/**
 * Copy the values of the settings form into CONF, save them, re-run the
 * search, and show a confirmation toast.
 */
function updateSettings() {
    Object.assign(CONF.options, {
        display: $("#settingDisplay").val(),
        fuzzy: $("#settingFuzzy").prop("checked"),
        highlight: $("#settingHighlight").prop("checked"),
    });
    CONF.save();

    searchDebounced();

    const toastOptions = {
        heading: "Settings updated",
        text: "Settings saved to browser storage",
        stack: 3,
        bgColor: "#00a4bc",
        textColor: "#fff",
        position: 'bottom-right',
        hideAfter: 3000,
        loaderBg: "#08c7e8",
    };
    $.toast(toastOptions);
}
/**
 * Reload CONF from storage and reflect its values in the settings form.
 */
function loadSettings() {
    CONF.load();

    const {display, fuzzy, highlight} = CONF.options;
    $("#settingDisplay").val(display);
    $("#settingFuzzy").prop("checked", fuzzy);
    $("#settingHighlight").prop("checked", highlight);
}

View File

@@ -11,9 +11,10 @@
<nav class="navbar navbar-expand-lg">
<a class="navbar-brand" href="/">sist2</a>
<span class="badge badge-pill version">v1.2.0</span>
<span class="badge badge-pill version">v1.2.17</span>
<span class="tagline">Lightning-fast file system indexer and search tool </span>
<a style="margin-left: auto" id="theme" class="btn" title="Toggle theme" href="/">Theme</a>
<button style="margin-left: auto" class="btn" type="button" data-toggle="modal" data-target="#settings" onclick="loadSettings()">Settings</button>
<a id="theme" class="btn" title="Toggle theme" href="/">Theme</a>
</nav>
<div class="container">
@@ -25,13 +26,20 @@
<div class="input-group">
<div class="input-group-prepend">
<div class="input-group-text">
<span title="Toggle fuzzy searching" onclick="document.getElementById('fuzzyToggle').click()">Fuzzy&nbsp</span>
<span title="Toggle fuzzy searching" onclick="document.getElementById('fuzzyToggle').click()">Fuzzy&nbsp</span>
<input title="Toggle fuzzy searching" type="checkbox" id="fuzzyToggle"
onclick="toggleFuzzy()" checked>
</div>
</div>
<input id="searchBar" type="search" class="form-control" placeholder="Search">
<div class="input-group-append">
<button class="btn btn-outline-secondary small-btn" type="button" data-toggle="modal"
data-target="#help">?
</button>
<button class="btn btn-outline-secondary large-btn" type="button" data-toggle="modal"
data-target="#help">Help
</button>
</div>
</div>
<input title="File size" id="sizeSlider" name="size">
@@ -45,10 +53,12 @@
<div class="col" id="treeTabs">
<ul class="nav nav-tabs" role="tablist">
<li class="nav-item">
<a class="nav-link active" data-toggle="tab" href="#mime" role="tab" aria-controls="home" aria-selected="true">Mime Types</a>
<a class="nav-link active" data-toggle="tab" href="#mime" role="tab" aria-controls="home"
aria-selected="true">Mime Types</a>
</li>
<li class="nav-item">
<a class="nav-link" data-toggle="tab" href="#tag" role="tab" aria-controls="profile" aria-selected="false" title="User-defined tags">Tags</a>
<a class="nav-link" data-toggle="tab" href="#tag" role="tab" aria-controls="profile"
aria-selected="false" title="User-defined tags">Tags</a>
</li>
</ul>
<div class="tab-content" id="myTabContent">
@@ -65,6 +75,116 @@
</div>
</div>
<div class="modal" id="modal" tabindex="-1" role="dialog" aria-labelledby="modal-title" aria-hidden="true">
<div class="modal-dialog modal-lg modal-dialog-centered" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="modal-title"></h5>
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">&times;</span>
</button>
</div>
<div class="modal-body" id="modal-body"></div>
</div>
</div>
</div>
<div class="modal" id="help" tabindex="-1" role="dialog" aria-labelledby="modal-title" aria-hidden="true">
<div class="modal-dialog modal-lg modal-dialog-centered" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title">Search help</h5>
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">&times;</span>
</button>
</div>
<div class="modal-body">
<table class="table">
<tbody>
<tr>
<td><code>+</code></td>
<td>signifies AND operation</td>
</tr>
<tr>
<td><code>|</code></td>
<td>signifies OR operation</td>
</tr>
<tr>
<td><code>-</code></td>
<td>negates a single token</td>
</tr>
<tr>
<td><code>""</code></td>
<td>wraps a number of tokens to signify a phrase for searching</td>
</tr>
<tr>
<td><code>*</code></td>
<td>at the end of a term signifies a prefix query</td>
</tr>
<tr>
<td><code>(</code> and <code>)</code></td>
<td>signify precedence</td>
</tr>
<tr>
<td><code>~N</code></td>
<td>after a word signifies edit distance (fuzziness)</td>
</tr>
<tr>
<td><code>~N</code></td>
<td>after a phrase signifies slop amount</td>
</tr>
</tbody>
</table>
<p>For example: <code>"fried eggs" +(eggplant | potato) -frittata</code> will match the phrase
<i>fried eggs</i> and either <i>eggplant</i> or <i>potato</i>, but will ignore results
containing <i>frittata</i>.</p>
<p>When neither <code>+</code> nor <code>|</code> is specified, the default operator is <code>+</code> (and).</p>
<p>When the <b>Fuzzy</b> option is checked, partial matches are also returned.</p>
<br>
<p>For more information, see <a target="_blank"
href="//www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html">Elasticsearch
documentation</a></p>
</div>
</div>
</div>
</div>
<div class="modal" id="settings" tabindex="-1" role="dialog" aria-labelledby="modal-title" aria-hidden="true">
<div class="modal-dialog modal-dialog-centered" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title">Settings</h5>
<button type="button" class="close" data-dismiss="modal" aria-label="Close">
<span aria-hidden="true">&times;</span>
</button>
</div>
<div class="modal-body">
<div class="custom-control custom-checkbox">
<input type="checkbox" class="custom-control-input" id="settingHighlight">
<label class="custom-control-label" for="settingHighlight">Enable highlighting</label>
</div>
<div class="custom-control custom-checkbox">
<input type="checkbox" class="custom-control-input" id="settingFuzzy">
<label class="custom-control-label" for="settingFuzzy">Set fuzzy search by default</label>
</div>
<label for="settingDisplay">Display</label>
<select id="settingDisplay" class="form-control form-control-sm">
<option value="grid">Grid</option>
<option value="list">List</option>
</select>
<br>
<button style="float: right" class="btn btn-primary" onclick="updateSettings()">Update settings</button>
</div>
</div>
</div>
</div>
<div id="searchResults"></div>
</div>